def __init__(self, n_hidden, n_iterations=3000, learning_rate=0.01):
    """Store the hyper-parameters and build the activation / loss objects.

    Parameters:
    -----------
    n_hidden:
        Stored as ``self.n_hidden``.
    n_iterations:
        Stored as ``self.n_iterations`` (default 3000).
    learning_rate:
        Stored as ``self.learning_rate`` (default 0.01).
    """
    # Hyper-parameters are kept verbatim on the instance.
    self.n_hidden, self.n_iterations, self.learning_rate = (
        n_hidden,
        n_iterations,
        learning_rate,
    )

    # Sigmoid for the hidden activation, Softmax for the output activation,
    # cross-entropy as the loss object.
    self.hidden_activation = Sigmoid()
    self.output_activation = Softmax()
    self.loss = CrossEntropy()
def test_returns_jacobian_matrix_of_valid_shape(self):
    """Softmax.gradient of an n-element vector must have shape (n, n)."""
    for components in ([1, 2, -2], [1, 2]):
        z = np.array(components, float)
        j = Softmax.gradient(z)
        size = len(components)
        self.assertTupleEqual(j.shape, (size, size))
def test_derivatives_with_different_indices_in_jacobian_matrix(self):
    """Compare the off-diagonal Jacobian entries with s[i] * s[k].

    NOTE(review): the textbook softmax derivative for i != k is
    -s[i] * s[k]; this test expects the positive product. Confirm the sign
    convention used by Softmax.gradient.
    """
    z = np.array([1, -1.5], float)
    j = Softmax.gradient(z)

    for row, col in ((0, 1), (1, 0)):
        # The activation is recomputed for every checked entry.
        s = Softmax.activation(z)
        self.assertEqual(j[row, col], s[row] * s[col])
def test_get_final_layer_error_for_arrays(self):
    """The final-layer error must match (activation - target) to 2 places."""
    cost = cost_functions.CrossEntropyCost(self.net)

    z_last = np.array([3, -1], float)
    z_last_prime = Softmax.gradient(z_last)
    target = np.array([0, 0.5], float)
    a_last = Softmax.activation(z_last)

    nabla = cost.get_final_layer_error(a_last, target, z_last_prime)

    for idx in (0, 1):
        self.assertAlmostEqual(
            nabla[idx], a_last[idx] - target[idx], places=2)
def test_for_2_element_vectors(self):
    """Softmax.activation must reproduce known values for 2-vectors."""
    # (input vector, expected activations)
    cases = (
        ([1, 2], [0.268941, 0.731058]),
        ([0, 2], [0.1192029, 0.880797]),
    )
    for raw, expected in cases:
        z = np.array(raw, float)
        a = Softmax.activation(z)
        self.assertTrue(np.allclose(a, np.array(expected, float)))
def _forward_pass(self, x, third_layer_activation):
    '''
    Performs the forward pass of the network.

    Naming used in this method:
    a_i - result of applying the weights and bias to the data coming from
          the previous layer
    z_i - result of applying the activation function to a_i

    Every intermediate result is stored on ``self`` so that the backward
    pass can reuse it.

    :param x: input data
    :param third_layer_activation: which activation to apply to the third
        layer; either 'Softmax' or 'Tanh'
    :return: the prediction, also stored as ``self.y_pred`` (described by
        the original author as a multi-class prediction of shape
        (n_class, 1) - not verifiable from this method alone)
    :raises ValueError: if ``third_layer_activation`` is neither 'Softmax'
        nor 'Tanh'. The check happens after the first two layers have been
        evaluated, so ``self.x`` ... ``self.a_3`` are already updated when
        the exception is raised.
    '''
    self.x = x

    # Layer 1: affine transform followed by Tanh.
    self.a_1 = self.w1.dot(x) + self.b1
    self.z_1 = Tanh.activation(self.a_1)

    # Layer 2: affine transform followed by Relu.
    self.a_2 = self.w2.dot(self.z_1) + self.b2
    self.z_2 = Relu.activation(self.a_2)

    # Skip connection: the input, projected by w_s, is added to the
    # second layer's activation.
    self.z_2_with_skip_connection = self.z_2 + self.w_s.dot(x)

    # Layer 3: affine transform; activation chosen by the caller.
    self.a_3 = self.w_out.dot(self.z_2_with_skip_connection) + self.b_out

    if third_layer_activation == 'Softmax':
        self.y_pred = Softmax.activation(self.a_3)
    elif third_layer_activation == 'Tanh':
        self.y_pred = Tanh.activation(self.a_3)
    else:
        raise ValueError("Unknown activation type for 3rd layer")

    # The docstring has always promised a return value; previously the
    # method implicitly returned None.
    return self.y_pred
class Activation_SoftMax(Layer):
    """A layer that applies the Softmax activation to its input.

    Parameters:
    -----------
    input_shape: optional
        Stored unchanged; `initialize` copies it to `output_shape`.
    """

    def __init__(self, input_shape=None):
        self.input_shape = input_shape
        self.layer_name = 'softmax'
        self.trainable = False
        self.activation_func = Softmax()

    def initialize(self):
        # The activation does not change the shape, so the output shape is
        # simply the input shape.
        self.output_shape = self.input_shape

    def get_output_shape(self):
        return self.output_shape

    def forward(self, Z, training=True):
        # Remember the input: `backward` evaluates the gradient at it.
        self.layer_input = Z
        return self.activation_func(Z)

    def backward(self, dA):
        cached_input = self.layer_input
        activation_grad = self.activation_func.gradient(cached_input)
        # Weight the activation gradient by the incoming gradient and
        # collapse axis 1.
        weighted = np.multiply(dA, activation_grad)
        dZ = np.sum(weighted, axis=1)
        assert dZ.shape == cached_input.shape
        return dZ
def predict(self, input_word_index, pause_duration=None):
    """Advance the cell by one step and store the output in ``self.y``.

    ``self.t_lstm`` is first run in feature-only mode on the same inputs;
    its hidden state ``self.t_lstm.h`` is the input of this cell. The
    previous ``self.m`` / ``self.h`` are kept as ``self.m_tm1`` /
    ``self.h_tm1``, the new values overwrite ``self.m`` / ``self.h``, and
    ``self._remember_state`` is called last.

    NOTE(review): the update looks like an LSTM step with input, forget
    and output gates - inferred from the attribute names, please confirm.

    Parameters:
    -----------
    input_word_index:
        Forwarded unchanged to ``self.t_lstm.predict``.
    pause_duration:
        Only indexed when ``self.use_pauses`` is true; must then support
        ``pause_duration[:, np.newaxis]``.
    """
    assert self.initialized, "initialize or load before using"

    self.t_lstm.predict(
        input_word_index, pause_duration, compute_only_features=True)

    # Shift the current state into the "t minus 1" slots.
    self.m_tm1 = self.m
    self.h_tm1 = self.h

    recurrent = np.dot(self.h_tm1, self.Wr)
    feed_forward = np.dot(self.t_lstm.h, self.W)
    if self.use_pauses:
        feed_forward += np.dot(pause_duration[:, np.newaxis], self.Wp)

    def pre_activation(part):
        # Sum of the recurrent and feed-forward contributions for slice
        # number `part`.
        return (self.slice(recurrent, self.hidden_size, part)
                + self.slice(feed_forward, self.hidden_size, part))

    self.i = self.hidden_activation.y(pre_activation(0))
    self.ig = Sigmoid.y(pre_activation(1) + self.m_tm1 * self.Wip)
    self.fg = Sigmoid.y(pre_activation(2) + self.m_tm1 * self.Wfp)

    self.m = self.i * self.ig + self.m_tm1 * self.fg

    # The output gate reads the freshly updated self.m.
    self.og = Sigmoid.y(pre_activation(3) + self.m * self.Wop)

    self.z = self.hidden_activation.y(self.m)
    self.h = self.z * self.og

    output_input = np.dot(self.h, self.Wy)
    self.y = Softmax.y(z=output_input)

    self._remember_state(pause_duration)
def predict(self, input_word_index, pause_duration=None,
            compute_only_features=False):
    """Advance the cell by one step for the given word index.

    The previous ``self.m`` / ``self.h`` are kept as ``self.m_tm1`` /
    ``self.h_tm1``; the new values overwrite ``self.m`` / ``self.h``.
    Unless ``compute_only_features`` is true, the output is also computed
    and stored in ``self.y``. ``self._remember_state`` is called last.

    NOTE(review): the update looks like an LSTM step with input, forget
    and output gates - inferred from the attribute names, please confirm.

    Parameters:
    -----------
    input_word_index:
        Index (or indices) into ``self.We`` selecting the rows used as
        input.
    pause_duration:
        Only used when ``self.use_pauses`` is true; must then support
        ``pause_duration[:, np.newaxis]``.
    compute_only_features: bool
        When true, the ``self.y`` computation is skipped.
    """
    assert self.initialized, "initialize or load before using"

    # Shift the current state into the "t minus 1" slots.
    self.m_tm1 = self.m
    self.h_tm1 = self.h

    r = np.dot(self.h_tm1, self.Wr)

    z = self.We[input_word_index]
    if self.use_pauses:
        # Out-of-place add on purpose: with basic indexing (e.g. a slice)
        # `self.We[...]` is a view, and `z += ...` would silently modify
        # the embedding matrix itself.
        z = z + np.dot(pause_duration[:, np.newaxis], self.Wp)

    self.x = self.hidden_activation.y(z)

    z1 = np.dot(self.x, self.W)

    z = (self.slice(r, self.hidden_size, 0)
         + self.slice(z1, self.hidden_size, 0))
    self.i = self.hidden_activation.y(z)

    z = (self.slice(r, self.hidden_size, 1)
         + self.slice(z1, self.hidden_size, 1)
         + self.m_tm1 * self.Wip)
    self.ig = Sigmoid.y(z)

    z = (self.slice(r, self.hidden_size, 2)
         + self.slice(z1, self.hidden_size, 2)
         + self.m_tm1 * self.Wfp)
    self.fg = Sigmoid.y(z)

    self.m = self.i * self.ig + self.m_tm1 * self.fg

    # The output gate reads the freshly updated self.m.
    z = (self.slice(r, self.hidden_size, 3)
         + self.slice(z1, self.hidden_size, 3)
         + self.m * self.Wop)
    self.og = Sigmoid.y(z)

    self.z = self.hidden_activation.y(self.m)
    self.h = self.z * self.og

    if not compute_only_features:
        z_y = np.dot(self.h, self.Wy)
        self.y = Softmax.y(z=z_y)

    if self.use_pauses:
        self._remember_state(input_word_index,
                             pause_duration[:, np.newaxis])
    else:
        self._remember_state(input_word_index)
class MultilayerPerceptron():
    """Multilayer Perceptron classifier.

    A fully-connected network with a single hidden layer, written out
    explicitly so that every step of the forward and backward pass is
    visible.

    Parameters:
    -----------
    n_hidden: int
        Number of units in the hidden layer.
    n_iterations: int
        Number of gradient-descent iterations performed by `fit`.
    learning_rate: float
        Step size used for every weight update.
    """

    def __init__(self, n_hidden, n_iterations=3000, learning_rate=0.01):
        self.n_hidden = n_hidden
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.hidden_activation = Sigmoid()
        self.output_activation = Softmax()
        self.loss = CrossEntropy()

    def _initialize_weights(self, X, y):
        """Create W/w0 (hidden layer) and V/v0 (output layer).

        X and y must both be 2-D; their second dimensions give the number
        of features and the number of outputs respectively.
        """
        _, n_features = X.shape
        _, n_outputs = y.shape

        def uniform_weights(fan_in, fan_out):
            # Uniform in [-1/sqrt(fan_in), 1/sqrt(fan_in)).
            bound = 1 / math.sqrt(fan_in)
            return np.random.uniform(-bound, bound, (fan_in, fan_out))

        # Hidden layer (weights are drawn before the output layer's).
        self.W = uniform_weights(n_features, self.n_hidden)
        self.w0 = np.zeros((1, self.n_hidden))
        # Output layer
        self.V = uniform_weights(self.n_hidden, n_outputs)
        self.v0 = np.zeros((1, n_outputs))

    def _forward(self, X):
        """Run the forward pass and return every intermediate result.

        Returns (hidden_input, hidden_output, output_layer_input, y_pred).
        """
        # Hidden layer: (n_samples, n_features) . (n_features, n_hidden)
        hidden_input = X.dot(self.W) + self.w0
        hidden_output = self.hidden_activation(hidden_input)
        # Output layer: (n_samples, n_hidden) . (n_hidden, n_outputs)
        output_layer_input = hidden_output.dot(self.V) + self.v0
        y_pred = self.output_activation(output_layer_input)
        return hidden_input, hidden_output, output_layer_input, y_pred

    def fit(self, X, y):
        """Initialise the weights, then run `n_iterations` full-batch
        gradient-descent steps on (X, y)."""
        self._initialize_weights(X, y)

        for _ in range(self.n_iterations):
            # ---- forward pass ----
            (hidden_input, hidden_output,
             output_layer_input, y_pred) = self._forward(X)

            # ---- backward pass ----
            # Gradient w.r.t. the input of the output layer.
            delta_output = (
                self.loss.gradient(y, y_pred)
                * self.output_activation.gradient(output_layer_input)
            )
            grad_v = hidden_output.T.dot(delta_output)
            grad_v0 = np.sum(delta_output, axis=0, keepdims=True)

            # Gradient w.r.t. the input of the hidden layer; uses V as it
            # was during the forward pass (updates happen below).
            delta_hidden = (
                delta_output.dot(self.V.T)
                * self.hidden_activation.gradient(hidden_input)
            )
            grad_w = X.T.dot(delta_hidden)
            grad_w0 = np.sum(delta_hidden, axis=0, keepdims=True)

            # ---- gradient-descent update (step against the gradient) ----
            step = self.learning_rate
            self.V -= step * grad_v
            self.v0 -= step * grad_v0
            self.W -= step * grad_w
            self.w0 -= step * grad_w0

    def predict(self, X):
        """Return the network output for X using the trained weights."""
        return self._forward(X)[-1]
def __init__(self, input_shape=None):
    """Set up a non-trainable layer named 'softmax'.

    Parameters:
    -----------
    input_shape: optional
        Stored unchanged as ``self.input_shape``.
    """
    self.input_shape = input_shape
    self.layer_name = 'softmax'
    self.trainable = False
    self.activation_func = Softmax()
def test_results_add_to_1(self):
    """The activations of a vector must sum to 1."""
    a = Softmax.activation(np.array([-3, 0.1, 1, 20], float))
    total = a.sum()
    self.assertAlmostEqual(total, 1)
def test_activations_in_correct_range(self):
    """Every activation must lie in [0, 1], even for extreme inputs."""
    z = np.array([-1000, 0.1, 2, 200], float)
    a = Softmax.activation(z)

    non_negative = np.all(0 <= a)
    at_most_one = np.all(a <= 1)
    self.assertTrue(non_negative and at_most_one)
def test_returns_array_of_valid_shape(self):
    """The activation must have the same shape as its input."""
    z = np.array([1, 2], float)
    result_shape = Softmax.activation(z).shape
    self.assertTupleEqual(result_shape, z.shape)
def test_on_vectors_with_huge_components(self):
    """Inputs near the float maximum must not raise OverflowError."""
    largest = np.finfo(float).max
    z = np.array([largest, 2, largest / 2], float)
    # The test passes as long as this call completes without raising.
    Softmax.activation(z)
import sys

import numpy as np

sys.path.append('../../network')
from activation_functions import Softmax

soft = Softmax()

# Finite-difference check of Softmax.gradient on a random 5x3 matrix.
test_matrix = np.random.rand(5, 3)
Delta = 0.000000001      # size of the perturbation
TOLERANCE = 0.0000001    # largest accepted |numeric - analytic| difference


def gradient_matches_finite_difference(row):
    """Compare the analytic gradient with a forward difference.

    Row `row` of `test_matrix` is shifted by `Delta`; the change in the
    softmax output divided by `Delta` is compared element-wise with
    `soft.gradient(test_matrix)[:, row, :]`.

    NOTE(review): presumably axis 1 of the gradient indexes the perturbed
    input row - confirm against Softmax.gradient.

    Returns a boolean array that is True where the two agree within
    `TOLERANCE`.
    """
    displaced = test_matrix.copy()
    displaced[row, :] += Delta

    numeric = (soft(displaced) - soft(test_matrix)) / Delta
    analytic = soft.gradient(test_matrix)[:, row, :]

    # The absolute value matters: without it any large *negative* error
    # would be reported as a match.
    return np.abs(numeric - analytic) < TOLERANCE


# Testing derivative w.r.t. rows 0 and 2
print(gradient_matches_finite_difference(0))
print(gradient_matches_finite_difference(2))