def __init__(self, weights_xh, bias_xh, weights_y, bias_y):
    self.k, self.H = weights_y.shape
    self.j = weights_xh.shape[1] - self.H
    # initialize trainable parameters:
    # - gate weights acting on the hidden state are Wf, Wi, Wc, Wo, each (H x H), stacked as (4H x H)
    # - gate weights acting on the input are W1, W2, W3, W4, each (H x j), stacked as (4H x j)
    # - both stacks are combined into one matrix of shape (4H x (j + H))
    # - input_tensor and hidden_state are concatenated to shape (1 x (j + H))
    # - the output weights are Wy, shape (k x H)
    self.w_xh = weights_xh  # W_xh = [[W1, Wf], [W2, Wi], [W3, Wc], [W4, Wo]], shape (4H x (j + H))
    self.b_xh = bias_xh     # (1 x 4H)
    self.w_y = weights_y    # (k x H)
    self.b_y = bias_y       # (1 x k)
    self.input_xh = None
    self.cell_state = None
    self.hidden_state = None
    # store intermediate results for the backward pass
    self.f_t = None
    self.i_t = None
    self.c_hat_t = None
    self.o_t = None
    self.a_t = None
    self.con_tensor_xh = None
    self.out_hidden_state = None
    self.tan = [TanH() for _ in range(2)]
    self.sig = [Sigmoid() for _ in range(4)]
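# A minimal standalone sketch (not part of the original class) of how the combined weight
# matrix of shape (4H x (j + H)) is split back into the four per-gate pre-activations.
# H, j, the random values and the gate order [f, i, c_hat, o] follow the comments above
# but are otherwise assumptions made only for illustration.
import numpy as np

H, j = 7, 13
w_xh = np.random.randn(4 * H, j + H)                     # combined gate weights
b_xh = np.zeros((1, 4 * H))                              # combined gate bias
x_h = np.random.randn(1, j + H)                          # concatenated [input_tensor, hidden_state]
a = np.dot(x_h, w_xh.T) + b_xh                           # pre-activations, shape (1, 4H)
f_pre, i_pre, c_hat_pre, o_pre = np.split(a, 4, axis=1)  # one (1, H) slice per gate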
import numpy as np
# TanH and Sigmoid are the activation layers implemented elsewhere in this exercise.


class RNN_cell:
    def __init__(self, W_xh, W_hh, W_hy, B_h, B_y):
        # W_xh: (H, J)  W_hh: (H, H)  W_hy: (K, H)  B_h: (1, H)  B_y: (1, K)
        self.W_xh, self.W_hh, self.W_hy, self.B_h, self.B_y = W_xh, W_hh, W_hy, B_h, B_y
        # variables stored in the forward pass for the backward pass
        self.sigmoid = None       # sigmoid activation function
        self.tanh = None          # tanh activation function
        self.input_tensor = None  # input tensor
        self.output_h = None      # hidden state of the current cell
        self.input_h = None       # hidden state of the previous cell

    def forward(self, input_tensor, hidden_state):
        self.tanh = TanH()
        self.sigmoid = Sigmoid()
        # store variables which are needed in the backward pass
        self.input_tensor = input_tensor
        self.input_h = hidden_state
        # forward propagation: h_t = tanh(h_{t-1} W_hh^T + x_t W_xh^T + b_h), y_t = σ(h_t W_hy^T + b_y)
        self.output_h = self.tanh.forward(
            np.dot(hidden_state, self.W_hh.T) + np.dot(input_tensor, self.W_xh.T) + self.B_h)
        output_tensor = self.sigmoid.forward(
            np.dot(self.output_h, self.W_hy.T) + self.B_y)
        return output_tensor, self.output_h

    def backward(self, error_tensor, hidden_error):
        error_tensor = self.sigmoid.backward(error_tensor)
        # error transferred through the tanh: output path plus recurrent path
        e_tmp = self.tanh.backward(np.dot(error_tensor, self.W_hy) + hidden_error)
        hidden_error = np.dot(e_tmp, self.W_hh)
        output_error = np.dot(e_tmp, self.W_xh)
        grad_W_hy = np.dot(error_tensor.reshape(-1, 1), self.output_h.reshape(1, -1))  # (K, H)
        grad_B_y = error_tensor                                                        # (1, K)
        grad_W_hh = np.dot(e_tmp.reshape(-1, 1), self.input_h.reshape(1, -1))          # (H, H)
        grad_W_xh = np.dot(e_tmp.reshape(-1, 1), self.input_tensor.reshape(1, -1))     # (H, J)
        grad_B_h = e_tmp
        return output_error, hidden_error, grad_W_hy, grad_B_y, grad_W_hh, grad_W_xh, grad_B_h
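# A minimal usage sketch (an assumption, not part of the original code) showing how RNN_cell
# could be unrolled over a short sequence: forward over the time steps, then backward in
# reverse while summing the per-step weight gradients. The sizes J, H, K, T and the random
# parameters are chosen only for illustration; TanH and Sigmoid are assumed to behave like
# the activation layers used above.
import numpy as np

J, H, K, T = 13, 7, 5, 4
rng = np.random.default_rng(0)
W_xh, W_hh = rng.standard_normal((H, J)), rng.standard_normal((H, H))
W_hy, B_h, B_y = rng.standard_normal((K, H)), np.zeros((1, H)), np.zeros((1, K))
# one cell object per time step so every step keeps its own cache, while all share the weights
cells = [RNN_cell(W_xh, W_hh, W_hy, B_h, B_y) for _ in range(T)]

# forward: carry the hidden state from cell to cell
h = np.zeros((1, H))
outputs = []
for t in range(T):
    y, h = cells[t].forward(rng.standard_normal((1, J)), h)
    outputs.append(y)

# backward: carry the hidden error back in time and accumulate the weight gradients
hidden_error = np.zeros((1, H))
grads = None
for t in reversed(range(T)):
    _, hidden_error, *step_grads = cells[t].backward(np.ones((1, K)), hidden_error)
    grads = step_grads if grads is None else [g + s for g, s in zip(grads, step_grads)]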
def forward(self, input_tensor, hidden_state, cell_state):
    """
    Forward propagation of one LSTM cell.

    Args:
        input_tensor: x_t, input tensor of the current time step.
        hidden_state: h_{t-1}, hidden state of the previous LSTM cell (time step).
        cell_state:   c_{t-1}, cell state of the previous LSTM cell (time step).

    Returns:
        ndarray, output_tensor: y_t, output tensor of the current time step.
        ndarray, next_h: h_t, hidden state of the current cell, passed on to the next cell.
        ndarray, next_c: c_t, cell state of the current cell, passed on to the next cell.
    """
    # data preparation
    x = input_tensor.reshape(1, -1)   # shape: (1, J)
    prev_h = hidden_state             # shape: (1, H)
    prev_c = cell_state               # shape: (1, H)
    _, H = hidden_state.shape         # hidden size
    # initialize tanh and sigmoid functions
    self.sigmoid = [Sigmoid() for _ in range(4)]
    self.tanh = [TanH() for _ in range(2)]
    # forward propagation in the LSTM cell
    embedding = np.dot(x, self.W_xh.T) + np.dot(prev_h, self.W_hh.T) + self.B_h  # (1, 4H)
    f = self.sigmoid[0].forward(embedding[:, :H])             # forget gate
    i = self.sigmoid[1].forward(embedding[:, H:2 * H])        # input gate
    c_hat = self.tanh[0].forward(embedding[:, 2 * H:3 * H])   # candidate cell state
    o = self.sigmoid[2].forward(embedding[:, 3 * H:])         # output gate
    # calculation of the new cell state
    next_c = prev_c * f + i * c_hat
    # calculation of the new hidden state
    tanh_output = self.tanh[1].forward(next_c)
    next_h = o * tanh_output
    # calculation of the output
    output_tensor = self.sigmoid[3].forward(np.dot(next_h, self.W_hy.T) + self.B_y)
    # store the variables which are needed in the backward propagation
    self.cache = [f, i, c_hat, o, x, prev_h, prev_c, tanh_output, next_h, next_c]
    return output_tensor, next_h, next_c
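# A minimal standalone sketch (an assumption, not part of the original code) that repeats the
# gate computation of the forward pass above with plain numpy, to sanity-check the shapes of
# one LSTM step. The sizes J, H, K and the random parameters are chosen only for illustration.
import numpy as np

J, H, K = 13, 7, 5
rng = np.random.default_rng(0)
W_xh, W_hh = rng.standard_normal((4 * H, J)), rng.standard_normal((4 * H, H))
W_hy, B_h, B_y = rng.standard_normal((K, H)), np.zeros((1, 4 * H)), np.zeros((1, K))
x, prev_h, prev_c = rng.standard_normal((1, J)), np.zeros((1, H)), np.zeros((1, H))

sigmoid = lambda a: 1.0 / (1.0 + np.exp(-a))
embedding = x @ W_xh.T + prev_h @ W_hh.T + B_h                                 # (1, 4H)
f, i = sigmoid(embedding[:, :H]), sigmoid(embedding[:, H:2 * H])               # forget and input gate
c_hat, o = np.tanh(embedding[:, 2 * H:3 * H]), sigmoid(embedding[:, 3 * H:])   # candidate, output gate
next_c = prev_c * f + i * c_hat                                                # (1, H)
next_h = o * np.tanh(next_c)                                                   # (1, H)
y = sigmoid(next_h @ W_hy.T + B_y)                                             # (1, K)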
import copy

import numpy as np
# Base, FullyConnected and TanH come from the other modules of this exercise.


class RNN(Base.Base_Layer):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size = input_size    # e.g. 13
        self.hidden_size = hidden_size  # e.g. 7
        self.output_size = output_size  # e.g. 5
        self.len_tbptt = 0              # truncation length for BPTT (batch size is e.g. 9)
        self.FC_h = FullyConnected(hidden_size + input_size, hidden_size)
        self.FC_y = FullyConnected(hidden_size, output_size)
        self.tan_h = TanH()
        self.h_t = None
        self.memory = False
        self.last_iter_h_t = None
        self.optimizer = None
        self.batch_size = None
        self.hidden_FC_mem = []

    # whether the RNN regards subsequent sequences as belonging to the same long sequence
    @property
    def memorize(self):
        return self.memory

    @memorize.setter
    def memorize(self, value):
        self.memory = value

    """
    forward(input_tensor) returns the input tensor for the next layer. The "batch" dimension
    is treated as the "time" dimension of the sequence over which the recurrence is performed.
    The first hidden state of an iteration is all zeros if the memorize flag is False;
    otherwise the hidden state of the last iteration is restored. Parts of the RNN are
    composed from layers that were already implemented (FullyConnected, TanH).
    """

    # input_tensor has shape (batch_size, input_size)
    def forward(self, input_tensor):
        self.batch_size = input_tensor.shape[0]
        # prepare a matrix for all hidden states so no extra saving of vectors is needed
        if self.memory:
            if self.h_t is None:
                self.h_t = np.zeros((self.batch_size + 1, self.hidden_size))  # (9+1, 7)
            else:
                self.h_t[0] = self.last_iter_h_t  # restore the hidden state of the last iteration
        else:
            self.h_t = np.zeros((self.batch_size + 1, self.hidden_size))
        y_t = np.zeros((self.batch_size, self.output_size))

        # concatenate h_{t-1} and x_t into x̃_t and forward it to obtain the new hidden state h_t:
        # 1: for t from 1 to T do:
        # 2:   u_t = W_hh · h_{t-1} + W_xh · x_t + b_h   (here: x̃_t · W_h)
        # 3:   h_t = tanh(u_t)
        # 4:   o_t = W_hy · h_t + b_y
        # 5:   ŷ_t = σ(o_t)
        for batch in range(self.batch_size):  # batch index = time step
            axis_h_t = self.h_t[batch][np.newaxis, :]
            axis_input_t = input_tensor[batch][np.newaxis, :]
            new_input = np.concatenate((axis_h_t, axis_input_t), axis=1)  # x̃_t
            self.hidden_FC_mem.append(new_input)
            wt = self.FC_h.forward(new_input)
            self.h_t[batch + 1] = self.tan_h.forward(wt)  # h_t = tanh(x̃_t · W_h)
            # ŷ_t = W_hy · h_t; the bias is added inside FC_y and the sigmoid is left to later layers
            y_t[batch] = self.FC_y.forward(self.h_t[batch + 1][np.newaxis, :])
        self.last_iter_h_t = self.h_t[-1]
        self.input_tensor = input_tensor
        return y_t

    # Remember that optimizers are decoupled from our layers.
    def backward(self, error_tensor):
        self.error_tensor_out = np.zeros((self.batch_size, self.input_size))
        hx_size = self.hidden_size + self.input_size  # e.g. 7 + 13 = 20
        steps = 1
        self.gradient_weights_y = np.zeros((self.hidden_size + 1, self.output_size))
        self.gradient_weights_hx = np.zeros((hx_size + 1, self.hidden_size))
        gradient_tanh = 1 - self.h_t[1:] ** 2
        error_h = np.zeros((1, self.hidden_size))

        # truncated backpropagation through time:
        # 1: for t from T down to 1 do:
        # 2:   propagate the error through the output layer and the tanh
        # 3:   accumulate the weight gradients for at most len_tbptt steps
        for batch in reversed(range(self.batch_size)):
            one_batch_error = error_tensor[batch]
            # restore the input FC_y saw at this time step before computing its gradient
            self.FC_y.input_tensor = np.hstack((self.h_t[batch + 1], 1))[np.newaxis, :]
            error_y_h = self.FC_y.backward(one_batch_error[np.newaxis, :])
            gra_y_ht = error_h + error_y_h
            gradient_hidden_t = gradient_tanh[batch] * gra_y_ht
            # restore the input FC_h saw at this time step before computing its gradient
            concat = np.hstack((self.h_t[batch], self.input_tensor[batch], 1))
            self.FC_h.input_tensor = concat[np.newaxis, :]
            error_hx = self.FC_h.backward(gradient_hidden_t)
            error_h = error_hx[:, 0:self.hidden_size]                             # error w.r.t. h_{t-1}
            self.error_tensor_out[batch] = error_hx[:, self.hidden_size:hx_size]  # error w.r.t. x_t
            if steps <= self.len_tbptt:
                self.weights_y = self.FC_y.getter()
                self.weights_h = self.FC_h.getter()
                self.gradient_weights_y += self.FC_y.gradient_weights()
                self.gradient_weights_hx += self.FC_h.gradient_weights()
            steps += 1

        if self.optimizer is not None:
            self.weights_y = self.optimizer.calculate_update(self.weights_y, self.gradient_weights_y)
            self.weights_h = self.optimizer.calculate_update(self.weights_h, self.gradient_weights_hx)
            self.FC_y.setter(self.weights_y)
            self.FC_h.setter(self.weights_h)
        return self.error_tensor_out

    """
    The hidden state is computed with a single FullyConnected layer which receives the stacked
    hidden state and input tensor; the weights of this particular FullyConnected layer are
    considered to be the weights of the whole class. Access to them is provided through a
    getter and a setter with a property for the weights member.
    """

    @property
    def gradient_weights(self):
        return self.gradient_weights_hx

    @property
    def weights(self):
        return self.FC_h.getter()

    @weights.setter
    def weights(self, weights):
        self.FC_h.setter(weights)

    def setter(self, optimizer):
        self._optimizer = copy.deepcopy(optimizer)

    def getter(self):
        return self._optimizer

    optimizer = property(getter, setter)

    def initialize(self, weights_initializer, bias_initializer):
        self.FC_y.initialize(weights_initializer, bias_initializer)
        self.FC_h.initialize(weights_initializer, bias_initializer)
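# A minimal usage sketch (an assumption, not part of the submission) showing how the memorize
# flag carries the hidden state across consecutive forward calls and how len_tbptt limits how
# many time steps contribute to the accumulated gradients. Sizes and data are arbitrary.
import numpy as np

rnn = RNN(input_size=13, hidden_size=7, output_size=5)
rnn.memorize = True        # treat consecutive batches as one long sequence
rnn.len_tbptt = 9          # accumulate gradients over at most 9 time steps

x1, x2 = np.random.randn(9, 13), np.random.randn(9, 13)  # two batches of 9 time steps each
y1 = rnn.forward(x1)                      # starts from an all-zero hidden state
y2 = rnn.forward(x2)                      # starts from the last hidden state of the first batch
error_in = rnn.backward(np.ones((9, 5)))  # error w.r.t. the inputs of the second batch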
class RNN(BaseLayer):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # hidden layer & output layer
        self.fc_h = FullyConnected(self.input_size + self.hidden_size, self.hidden_size)
        self.fc_y = FullyConnected(self.hidden_size, self.output_size)
        self.sigmoid = Sigmoid()
        self.tanh = TanH()
        # outputs of the hidden layer & output layer
        self.hidden_state = []
        self.hidden_state.append(np.zeros(hidden_size))  # a vector!
        self.output = []
        self._memorize = False
        self._gradient_weights = None
        self._optimizer = None

    @property
    def memorize(self):
        return self._memorize

    @memorize.setter
    def memorize(self, new_memorize):
        self._memorize = new_memorize

    @property
    def weights(self):
        return self.fc_h.weights

    @weights.setter
    def weights(self, new_weights):
        self.fc_h.weights = new_weights

    @property
    def gradient_weights(self):
        return self._gradient_weights

    @gradient_weights.setter
    def gradient_weights(self, new_grad_w):
        self._gradient_weights = new_grad_w

    @property
    def optimizer(self):
        return self._optimizer

    @optimizer.setter
    def optimizer(self, new_optimizer):
        self._optimizer = new_optimizer

    def forward(self, input_tensor):
        self.t = input_tensor.shape[0]
        # initial hidden state: if memorize is set, restore the last hidden state of the
        # previous batch; otherwise the recurrence starts from an all-zero vector
        if self.memorize and len(self.hidden_state) > 0:
            h_prev = self.hidden_state[-1]
        else:
            h_prev = np.zeros(self.hidden_size)
        self.hidden_state = []
        self.output = []
        for t in range(self.t):
            # h_t = tanh(W_h · [x_t, h_{t-1}])
            x_composed = np.atleast_2d(np.hstack((input_tensor[t], h_prev)))  # 2D
            h_t = self.tanh.forward(self.fc_h.forward(x_composed))            # 2D
            self.hidden_state.append(h_t.reshape(self.hidden_size))           # 1D
            h_prev = self.hidden_state[t]
            # y_t = σ(W_y · h_t)
            o_t = self.fc_y.forward(np.atleast_2d(self.hidden_state[t]))      # 2D
            self.output.append(self.sigmoid.forward(o_t.reshape(self.output_size)))  # 1D
        result = np.array(self.output)
        # store the activations of the sigmoid & tanh layers for the backward pass
        self.sigmoid_activations = result
        self.tanh_activations = np.array(self.hidden_state)
        return result

    def backward(self, error_tensor):
        gradient_wy = 0  # accumulated gradient w.r.t. W_y
        gradient_wh = 0  # accumulated gradient w.r.t. W_h
        gradient_ht = np.zeros((self.t, self.hidden_size))
        for t in range(error_tensor.shape[0] - 1, -1, -1):
            # gradient w.r.t. o_t: restore the stored sigmoid activations of step t
            self.sigmoid.activations = self.sigmoid_activations[t]
            gradient_ot = self.sigmoid.backward(error_tensor[t])
            # gradient w.r.t. h_t: the output path contributes at every step,
            # the recurrent path only for t < T-1
            if t == error_tensor.shape[0] - 1:
                gradient_ht[t] = self.fc_y.backward(np.atleast_2d(gradient_ot))
            else:
                # restore the stored tanh activations of step t+1
                self.tanh.activations = self.tanh_activations[t + 1]
                gradient_ut = self.tanh.backward(gradient_ht[t + 1])
                # backward through fc_h, keeping only the columns belonging to h_t (decompose W_h into W_xh, W_hh)
                wh = self.fc_h.backward(np.atleast_2d(gradient_ut))
                gradient_ht[t] = wh[:, self.input_size:self.input_size + self.hidden_size] \
                    + self.fc_y.backward(np.atleast_2d(gradient_ot))
            # gradient w.r.t. W_y
            gradient_wy += self.fc_y.gradient_weights
            # gradient w.r.t. W_h
            self.tanh.activations = self.tanh_activations[t]
            error = self.fc_h.backward(np.atleast_2d(self.tanh.backward(gradient_ht[t])))
            gradient_wh += self.fc_h.gradient_weights
            # decompose W_h into W_xh and W_hh if the individual gradients are needed:
            # gradient_whh = gradient_wh[self.input_size:self.input_size + self.hidden_size, :]
            # gradient_wxh = gradient_wh[0:self.input_size, :]
        return error

    def initialize(self, weights_initializer, bias_initializer):
        # initialize the stacked hidden-layer weights W_h = [W_xh; W_hh] together with their bias row;
        # the output layer fc_y is initialized separately with its own shapes
        weights = weights_initializer.initialize(
            (self.input_size + self.hidden_size, self.hidden_size),
            self.input_size + self.hidden_size, self.hidden_size)
        bias = bias_initializer.initialize(
            (1, self.hidden_size),
            self.input_size + self.hidden_size, self.hidden_size)
        self.weights = np.vstack((weights, bias))
        self.fc_y.initialize(weights_initializer, bias_initializer)
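# A minimal smoke test (an assumption, not part of the submission) for the RNN variant above:
# one forward and one backward pass on random data to check the shapes the rest of the
# pipeline would expect. The sizes are arbitrary; fc_h/fc_y, TanH and Sigmoid are assumed to
# expose the weights, gradient_weights and activations attributes used in the class above.
import numpy as np

rnn = RNN(input_size=13, hidden_size=7, output_size=5)
x = np.random.randn(9, 13)           # 9 time steps with 13 features each
y = rnn.forward(x)                   # expected shape: (9, 5)
err = rnn.backward(np.ones_like(y))  # error propagated through the earliest time step
print(y.shape, np.asarray(err).shape)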