"""
This tutorial covers your simplest neural network: a multilayer perceptron (MLP)
Also known as feedforward neural network.
We will learn to classify MNIST handwritten digit images into their correct label (0-9).
"""
import cPickle as pickle
import gzip
from PIL import Image
from opendeep.utils.image import tile_raster_images
import theano
import theano.tensor as T
import numpy
import numpy.random as rng
if __name__ == '__main__':
    # Load our data
    # Download the pickled, gzipped version from here and save it as datasets/mnist.pkl.gz:
    # http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    ################
    # Explore data #
    ################
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = pickle.load(gzip.open('datasets/mnist.pkl.gz', 'rb'))
print "Shapes:"
print train_x.shape, train_y.shape
print valid_x.shape, valid_y.shape
print test_x.shape, test_y.shape
print "--------------"
print "Example input:"
print train_x[0]
print "Example label:"
print train_y[0]
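    # As an optional sanity check, you can also look at how the labels are distributed -
    # each digit 0-9 should appear roughly equally often:
    print "Label counts:", numpy.bincount(train_y)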
    # Show example images - use the tile_raster_images helper function from OpenDeep
    # to arrange each flat 784-element array into a 28x28 image.
    input_images = train_x[:25]
    im = Image.fromarray(
        tile_raster_images(input_images,
                           img_shape=(28, 28),
                           tile_shape=(1, 25),
                           tile_spacing=(1, 1))
    )
    im.save("example_mnist_numbers.png")
    #########
    # Model #
    #########
    # Cool, now that we know a little about the input data, let's design the MLP to work with it!
    # An MLP looks like this: input -> hidden layer -> output classification.
    # Each stage is just a matrix multiplication with a nonlinear function applied afterwards.
    # Inputs are matrices where rows are examples and columns are pixels - so create a symbolic Theano matrix.
    x = T.matrix('xs')
    # Now let's start building the equation for our MLP!
    # The first transformation is from the input x to the hidden layer h.
    # We define this transformation as h = tanh(x.dot(W_x) + b_h),
    # where the learnable model parameters are W_x and b_h.
    # Therefore, we will need a weights matrix W_x and a bias vector b_h.
    # W_x has shape (input_size, hidden_size) and b_h has shape (hidden_size,).
    # Initialization is important in deep learning; we want something random so the model doesn't get stuck early.
    # Many papers have been written on this subject, but for now we will just use a normal distribution
    # with mean=0 and std=0.05.
    # Another good option for tanh layers is a uniform distribution over the interval +- sqrt(6/sum(shape)).
    # These are hyperparameters to play with.
    # Starting the bias at zero is fine.
    W_x = numpy.asarray(rng.normal(loc=0.0, scale=.05, size=(28 * 28, 500)), dtype=theano.config.floatX)
    b_h = numpy.zeros(shape=(500,), dtype=theano.config.floatX)
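    # For illustration, here is a sketch of the tanh-friendly uniform initialization mentioned above
    # (commented out - swap it in for the normal-distribution init above to experiment):
    # interval = numpy.sqrt(6. / (28 * 28 + 500))
    # W_x = numpy.asarray(rng.uniform(low=-interval, high=interval, size=(28 * 28, 500)),
    #                     dtype=theano.config.floatX)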
    # To update a variable used in an equation (for example, while learning),
    # Theano needs it to be in a special wrapper called a shared variable.
    # These are the model parameters for our first hidden layer!
    W_x = theano.shared(W_x, name="W_x")
    b_h = theano.shared(b_h, name="b_h")
    # Now, we can finally write the equation to give our symbolic hidden layer h!
    h = T.tanh(
        T.dot(x, W_x) + b_h
    )
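    # To make the network deeper, you would simply stack more transformations of the same form.
    # A sketch of a hypothetical second hidden layer (not used in the rest of this tutorial):
    # W_h2 = theano.shared(numpy.asarray(rng.normal(loc=0.0, scale=.05, size=(500, 500)),
    #                                    dtype=theano.config.floatX), name="W_h2")
    # b_h2 = theano.shared(numpy.zeros(shape=(500,), dtype=theano.config.floatX), name="b_h2")
    # h2 = T.tanh(T.dot(h, W_h2) + b_h2)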
    # Side note - a single softmax layer applied directly to the input (with no tanh hidden layer)
    # would be multiclass logistic regression!
    # We have the hidden layer h; let's put that softmax layer on top for the classification output y!
    # Same deal as before - the transformation is defined as:
    # y = softmax(h.dot(W_h) + b_y),
    # where the learnable parameters are W_h and b_y.
    # W_h has shape (hidden_size, output_size) and b_y has shape (output_size,).
    # We will use the same random initialization strategy as before.
    W_h = numpy.asarray(rng.normal(loc=0.0, scale=.05, size=(500, 10)), dtype=theano.config.floatX)
    b_y = numpy.zeros(shape=(10,), dtype=theano.config.floatX)
    # Don't forget to make them shared variables!
    W_h = theano.shared(W_h, name="W_h")
    b_y = theano.shared(b_y, name="b_y")
    # Now write the equation for the output!
    y = T.nnet.softmax(
        T.dot(h, W_h) + b_y
    )
    # Due to the softmax, each row of the output is a vector of class probabilities.
    # To get the output class 'guess' from the model, just take the index of the highest probability!
    y_hat = T.argmax(y, axis=1)
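    # For example, if a row of y came out as [.01, .02, .9, .01, .01, .01, .01, .01, .01, .01],
    # argmax over that row returns 2 - the model's digit guess for that image.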
    # That's everything! Just four model parameters and one input variable.
    ################
    # Optimization #
    ################
    # The variable y_hat represents the output of running our model, but we need a cost function to train with.
    # For a softmax (probability) output, we want to maximize the likelihood P(Y=y|X).
    # This means we want to minimize the negative log-likelihood cost!
    # (For a primer, see the machine learning course on Coursera.)
    # Cost functions always need the true outputs to compare against (this is supervised learning).
    # From before, we saw the labels were a vector of ints - so let's make a symbolic variable for this!
    correct_labels = T.lvector("labels")  # integer vector
    # Now we can compare our output probabilities from y with the true labels.
    # Because the labels are integers, we use them as an indexing mask to pick out, for each example,
    # the log-probability our model assigned to the correct label.
    log_likelihood = T.log(y)[T.arange(correct_labels.shape[0]), correct_labels]
    # We use the mean instead of the sum to be less dependent on batch size (better for flexibility).
    cost = -T.mean(log_likelihood)
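    # Equivalently, Theano ships a helper for this exact cost; assuming the integer-target form of
    # T.nnet.categorical_crossentropy, the two lines above collapse to:
    # cost = T.mean(T.nnet.categorical_crossentropy(y, correct_labels))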
    # The easiest way to train neural nets is with Stochastic Gradient Descent (SGD).
    # This takes each example (or minibatch), calculates the gradient of the cost, and changes the model
    # parameters a small amount in the direction opposite the gradient.
    # Fancier add-ons to SGD reduce the learning rate over time, add a momentum factor to the parameters, etc.
    # Before we can start training, we need to know what the gradients are.
    # Luckily we don't have to do any math! Theano has symbolic auto-differentiation, which means it can
    # calculate the gradients of arbitrary equations with respect to a cost and parameters.
    parameters = [W_x, b_h, W_h, b_y]
    gradients = T.grad(cost, parameters)
    # Now gradients contains the list of derivatives: [d_cost/d_W_x, d_cost/d_b_h, d_cost/d_W_h, d_cost/d_b_y].
    # One last thing we need to do before training is to use these gradients to update the parameters!
    # Remember how parameters are shared variables? Well, Theano uses something called updates,
    # which are just pairs of (shared_variable, new_variable_expression), to change their values.
    # So, let's create these updates to show how we change the parameter values during training with gradients!
    # We use a learning rate to make small steps over time.
    learning_rate = 0.01
    train_updates = [(param, param - learning_rate * gradient) for param, gradient in zip(parameters, gradients)]
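    # As mentioned above, a common add-on is momentum: keep a velocity per parameter and update with it
    # instead of the raw gradient. A commented-out sketch (not used in the rest of this tutorial):
    # momentum = 0.9
    # velocities = [theano.shared(numpy.zeros(p.get_value().shape, dtype=theano.config.floatX))
    #               for p in parameters]
    # train_updates = []
    # for param, gradient, velocity in zip(parameters, gradients, velocities):
    #     new_velocity = momentum * velocity - learning_rate * gradient
    #     train_updates.append((velocity, new_velocity))
    #     train_updates.append((param, param + new_velocity))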
    # Now we can create a Theano function that takes in real inputs and trains our model.
    f_train = theano.function(inputs=[x, correct_labels], outputs=cost, updates=train_updates,
                              allow_input_downcast=True)
    # For testing purposes, we don't want updates changing the parameters - so create a separate function!
    # We also care more about the output guesses, so let's return those instead of the cost.
    # (You could also compile a symbolic error rate directly, e.g. error = T.mean(T.neq(y_hat, correct_labels)).)
    f_test = theano.function(inputs=[x], outputs=y_hat)
    # Our training can begin!
    # The two hyperparameters we have for this part are the minibatch size (how many examples to process
    # in parallel) and the total number of passes over all the examples (epochs).
    batch_size = 100
    epochs = 30
    # Given our batch size, compute how many batches fit into each data set (integer division).
    train_batches = len(train_x) // batch_size
    valid_batches = len(valid_x) // batch_size
    test_batches = len(test_x) // batch_size
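    # A common refinement (omitted here to keep the tutorial simple) is to shuffle the training set
    # before each epoch so the minibatches differ between passes, e.g.:
    # order = rng.permutation(len(train_x))
    # train_x, train_y = train_x[order], train_y[order]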
    # Our main training loop!
    for epoch in range(epochs):
        print epoch + 1, ":",
        train_costs = []
        train_accuracy = []
        for i in range(train_batches):
            # Grab our minibatch of examples from the whole train set.
            batch_x = train_x[i * batch_size:(i + 1) * batch_size]
            batch_labels = train_y[i * batch_size:(i + 1) * batch_size]
            # Compute the cost from the train function (which also updates the parameters).
            costs = f_train(batch_x, batch_labels)
            # Compute the predictions from the test function (which does not update the parameters).
            preds = f_test(batch_x)
            # Compute the accuracy of our predictions against the correct batch labels.
            acc = sum(preds == batch_labels) / float(len(batch_labels))
            train_costs.append(costs)
            train_accuracy.append(acc)
        # Show the mean cost and accuracy across minibatches (i.e., over the entire train set!).
        print "cost:", numpy.mean(train_costs), "\ttrain:", str(numpy.mean(train_accuracy) * 100) + "%",
        valid_accuracy = []
        for i in range(valid_batches):
            batch_x = valid_x[i * batch_size:(i + 1) * batch_size]
            batch_labels = valid_y[i * batch_size:(i + 1) * batch_size]
            preds = f_test(batch_x)
            acc = sum(preds == batch_labels) / float(len(batch_labels))
            valid_accuracy.append(acc)
        print "\tvalid:", str(numpy.mean(valid_accuracy) * 100) + "%",
        test_accuracy = []
        for i in range(test_batches):
            batch_x = test_x[i * batch_size:(i + 1) * batch_size]
            batch_labels = test_y[i * batch_size:(i + 1) * batch_size]
            preds = f_test(batch_x)
            acc = sum(preds == batch_labels) / float(len(batch_labels))
            test_accuracy.append(acc)
        print "\ttest:", str(numpy.mean(test_accuracy) * 100) + "%"