def __init__(self, dim_list, eta=0.1):
    """
    Constructor for the network.

    Params:
        dim_list: a list of the number of dimensions (units) for each layer.
        eta: learning rate for each gradient descent step.
    """
    depth = len(dim_list)
    self.depth = depth
    self.dim_list = dim_list
    self.eta = eta

    # 1. Initialize each layer with output, partial_output, weight and bias,
    #    although partial_output is unused for the input layer, and weight
    #    and bias are unused for the output layer.
    #
    # 2. partial_weight is an internal variable and is not stored in a layer.
    self.layers = [
        {'output': Vector.fromIterable(0 for i in xrange(dim_list[l])),
         'partial_output': Vector.fromIterable(0 for i in xrange(dim_list[l])),
         'weight': Matrix.fromRandom(dim_list[l + 1], dim_list[l]),
         'bias': Vector.fromRandom(dim_list[l + 1])}
        for l in xrange(depth - 1)
    ]
    # Output layer: it has no outgoing weight or bias.
    self.layers.append({'output': Vector.fromList([0] * dim_list[depth - 1]),
                        'partial_output': Vector.fromList([0] * dim_list[depth - 1]),
                        'weight': None,
                        'bias': None})
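# --- Illustrative sketch (not part of the original listing) ---
# The gradients in _backward below use O * (1 - O), the derivative of the
# sigmoid, so the forward pass presumably computes
# O^{(l+1)} = sigmoid(W^{(l)} O^{(l)} + b^{(l)}). A minimal _forward in the
# same style, assuming the Vector/Matrix helpers used above and a sigmoid
# activation; the actual forward pass is not shown in this excerpt, so treat
# this as a sketch only.
import math  # assumed to be imported at module level

def _forward(self, x):
    # Load the input into layer 0.
    self.layers[0]['output'].assign(Vector.fromIterable(
        x[i] for i in xrange(self.dim_list[0])))
    # Propagate: output of layer l+1 is sigmoid(W^{(l)} * output^{(l)} + b^{(l)}).
    for layer_id in xrange(self.depth - 1):
        weight = self.layers[layer_id]['weight']
        bias = self.layers[layer_id]['bias']
        output = self.layers[layer_id]['output']
        self.layers[layer_id + 1]['output'].assign(Vector.fromIterable(
            1.0 / (1.0 + math.exp(-(sum(weight.item(j, i) * output[i]
                                        for i in xrange(self.dim_list[layer_id]))
                                    + bias[j])))
            for j in xrange(self.dim_list[layer_id + 1])
        ))
    return self.layers[self.depth - 1]['output']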
def _backward(self, x, y):
    # Output layer: dE/dO = O - y, which corresponds to E = 1/2 * sum((O - y)^2).
    # The value returned below reports the squared error without the 1/2 factor.
    layer_id = self.depth - 1
    output = self.layers[layer_id]['output']
    self.layers[layer_id]['partial_output'].assign(Vector.fromIterable(
        output[i] - y[i] for i in xrange(self.dim_list[layer_id])
    ))
    loss = sum((output[i] - y[i]) ** 2 for i in xrange(self.dim_list[layer_id]))

    # Hidden layers and the input layer.
    for layer_id in xrange(self.depth - 2, -1, -1):
        weight = self.layers[layer_id]['weight']
        bias = self.layers[layer_id]['bias']
        partial_output = self.layers[layer_id]['partial_output']
        output = self.layers[layer_id]['output']
        last_output = self.layers[layer_id + 1]['output']
        last_partial = self.layers[layer_id + 1]['partial_output']

        """
        Partial output for every layer except the output one:

            \frac{\partial E}{\partial O_k^{(l)}}
                = \sum_i \frac{\partial E}{\partial O_i^{(l+1)}}
                         O_i^{(l+1)} (1 - O_i^{(l+1)}) w_{ik}^{(l)}

        The partial output of the first layer is unnecessary, so we do not
        compute it. The extra division by (dim_list[layer_id] + 1.0) scales
        each gradient by the layer's fan-in plus the bias term.
        """
        if layer_id > 0:
            self.layers[layer_id]['partial_output'].assign(Vector.fromIterable(
                sum(last_partial[i] * last_output[i] * (1 - last_output[i]) * weight.item(i, k)
                    for i in xrange(self.dim_list[layer_id + 1])) / (self.dim_list[layer_id] + 1.0)
                for k in xrange(self.dim_list[layer_id])
            ))

        """
        Partial weight for every layer except the output one:

            \frac{\partial E}{\partial w_{ji}^{(l)}}
                = \frac{\partial E}{\partial O_j^{(l+1)}}
                  O_j^{(l+1)} (1 - O_j^{(l+1)}) O_i^{(l)}
        """
        partial_weight = Matrix.fromIterable(weight.row_num, weight.col_num, (
            last_partial[row_id] * last_output[row_id] * (1 - last_output[row_id])
            * output[col_id] / (self.dim_list[layer_id] + 1.0)
            for row_id in xrange(self.dim_list[layer_id + 1])
            for col_id in xrange(self.dim_list[layer_id])
        ))
        self.layers[layer_id]['weight'] -= self.eta * partial_weight

        """
        Partial bias is almost the same as partial weight, except that the
        input term is 1 instead of O_i^{(l)}:

            \frac{\partial E}{\partial b_j^{(l)}}
                = \frac{\partial E}{\partial O_j^{(l+1)}}
                  O_j^{(l+1)} (1 - O_j^{(l+1)}) \cdot 1
        """
        partial_bias = Vector.fromIterable(
            last_partial[row_id] * last_output[row_id] * (1 - last_output[row_id])
            * 1 / (self.dim_list[layer_id] + 1.0)
            for row_id in xrange(self.dim_list[layer_id + 1])
        )
        self.layers[layer_id]['bias'] -= self.eta * partial_bias

    return loss
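# --- Independent sanity check (not part of the original class) ---
# Verifies the chain-rule formula used above for a single sigmoid layer,
#     dE/dw_{ji} = (O_j - y_j) * O_j * (1 - O_j) * O_i,
# against a finite-difference estimate, assuming E = 1/2 * sum_j (O_j - y_j)^2
# (the convention under which dE/dO_j = O_j - y_j, as in _backward). The
# per-layer scaling by (fan-in + 1) applied above is a choice of this
# implementation and is omitted here. All names below are local to this check.
import math
import random

def _sigmoid(z):
    return 1.0 / (1.0 + math.exp(-z))

def _layer_loss(w, b, x, y):
    o = [_sigmoid(sum(w[j][i] * x[i] for i in range(len(x))) + b[j])
         for j in range(len(b))]
    return 0.5 * sum((o[j] - y[j]) ** 2 for j in range(len(y)))

random.seed(0)
x = [0.5, -0.2, 0.1]
y = [1.0, 0.0]
w = [[random.uniform(-1, 1) for _ in range(len(x))] for _ in range(len(y))]
b = [random.uniform(-1, 1) for _ in range(len(y))]

# Analytic gradient for w[0][0] from the formula above.
o0 = _sigmoid(sum(w[0][i] * x[i] for i in range(len(x))) + b[0])
analytic = (o0 - y[0]) * o0 * (1 - o0) * x[0]

# Central finite-difference estimate of the same gradient.
eps = 1e-6
w[0][0] += eps
loss_plus = _layer_loss(w, b, x, y)
w[0][0] -= 2 * eps
loss_minus = _layer_loss(w, b, x, y)
w[0][0] += eps
numeric = (loss_plus - loss_minus) / (2 * eps)

# The two estimates should agree to roughly 1e-8.
print("analytic=%.10f numeric=%.10f" % (analytic, numeric))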