def __call__(self, inputs, state, scope=None):
    """Advance the layer-normalised GRU by one step.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        scope: optional variable-scope name; the class name is used when
            it is falsy.

    Returns:
        A ``(new_h, new_h)`` pair: the updated hidden state is returned
        both as the output and as the next state.
    """
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):
            # Reset gate and update gate share one projection of
            # [inputs, state]. We start with bias of 1.0 to not reset and
            # not update.
            gate_projection = linear(
                [inputs, state], 2 * self._num_units, False, 1.0)
            gate_logits = layer_norm(
                gate_projection,
                num_variables_in_tensor=2,
                initial_bias_value=1.0)
            gate_values = tf.sigmoid(gate_logits)
            reset_gate, update_gate = tf.split(
                axis=1, num_or_size_splits=2, value=gate_values)

        with tf.variable_scope("Candidate"):
            with tf.variable_scope("reset_portion"):
                normed_state = layer_norm(
                    linear([state], self._num_units, False))
                reset_portion = reset_gate * normed_state
            with tf.variable_scope("inputs_portion"):
                inputs_portion = layer_norm(
                    linear([inputs], self._num_units, False))
            candidate = tf.tanh(reset_portion + inputs_portion)

        # Blend the previous state with the candidate via the update gate.
        carried = update_gate * state
        written = (1 - update_gate) * candidate
        new_h = carried + written
    return new_h, new_h
def __call__(self, inputs, state, timestep=0, scope=None):
    """Advance a GRU with layer norm and multiplicative integration one step.

    Inputs and previous state are projected and layer-normalised
    separately, then merged by ``multiplicative_integration`` (called with
    ``weights_already_calculated=True``) for both the gates and the
    candidate.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        A ``(new_h, new_h)`` pair (output and next state are the same).
    """
    # NOTE(review): the source arrived with its line structure collapsed;
    # the gate merge is placed at cell-scope level here -- confirm the
    # nesting against existing checkpoints' variable names.
    gate_width = self._num_units * 2
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Inputs"):
            inputs_concat = linear([inputs], gate_width, False, 1.0)
            inputs_concat = layer_norm(
                inputs_concat,
                num_variables_in_tensor=2,
                initial_bias_value=1.0)

        with tf.variable_scope("Hidden_State"):
            hidden_state_concat = linear([state], gate_width, False)
            hidden_state_concat = layer_norm(
                hidden_state_concat, num_variables_in_tensor=2)

        gate_logits = multiplicative_integration(
            [inputs_concat, hidden_state_concat],
            2 * self._num_units,
            1.0,
            weights_already_calculated=True)
        # Old-style positional tf.split: (split_dim, num_split, value).
        reset_gate, update_gate = tf.split(1, 2, tf.sigmoid(gate_logits))

        with tf.variable_scope("Candidate"):
            with tf.variable_scope('input_portion'):
                input_portion = layer_norm(
                    linear([inputs], self._num_units, False))
            with tf.variable_scope('reset_portion'):
                normed_state = layer_norm(
                    linear([state], self._num_units, False))
                reset_portion = reset_gate * normed_state
            candidate_logits = multiplicative_integration(
                [input_portion, reset_portion],
                self._num_units,
                0.0,
                weights_already_calculated=True)
            candidate = tf.tanh(candidate_logits)

        carried = update_gate * state
        written = (1 - update_gate) * candidate
        new_h = carried + written
    return new_h, new_h
def __call__(self, inputs, state, timestep=0, scope=None):
    """Run the stacked highway transition for one step.

    Each of ``self.num_highway_layers`` layers computes a tanh "highway
    factor" and a sigmoid gate, then mixes the factor with the running
    state. Layers that see the inputs use ``multiplicative_integration``;
    the remaining layers use ``linear`` on the running state only (with
    ``layer_norm`` applied to the factor's pre-activation).

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: accepted for interface compatibility; not read here.

    Returns:
        A ``(state, state)`` pair holding the state after the last layer.
    """
    hidden = state
    for layer_idx in xrange(self.num_highway_layers):
        # The first layer always sees the inputs; later ones only if asked.
        feeds_inputs = self.use_inputs_on_each_layer or layer_idx == 0

        with tf.variable_scope('highway_factor_' + str(layer_idx)):
            if feeds_inputs:
                factor_logits = multiplicative_integration(
                    [inputs, hidden], self._num_units)
            else:
                factor_logits = layer_norm(
                    linear([hidden], self._num_units, True))
            highway_factor = tf.tanh(factor_logits)

        with tf.variable_scope('gate_for_highway_factor_' + str(layer_idx)):
            # -3.0 is the initial gate bias value handed to the helpers.
            if feeds_inputs:
                gate_logits = multiplicative_integration(
                    [inputs, hidden],
                    self._num_units,
                    initial_bias_value=-3.0)
            else:
                gate_logits = linear([hidden], self._num_units, True, -3.0)
            transform_gate = tf.sigmoid(gate_logits)
            carry_gate = 1 - transform_gate

        if self.use_recurrent_dropout and self.is_training:
            highway_factor = tf.nn.dropout(
                highway_factor, self.recurrent_dropout_factor)

        hidden = highway_factor * transform_gate + hidden * carry_gate
    return hidden, hidden
def multiplicative_integration(list_of_inputs, output_size,
                               initial_bias_value=0.0,
                               weights_already_calculated=False,
                               use_highway_gate=False,
                               use_l2_loss=False,
                               scope=None,
                               timestep=0):
    """Combine two inputs as ``alpha*Wx*Uz + beta1*Uz + beta2*Wx + bias``.

    Args:
        list_of_inputs: exactly two entries. When
            ``weights_already_calculated`` is true they are used directly
            as ``Wx`` and ``Uz`` (useful for batch-normed inputs);
            otherwise each is projected to ``output_size`` with
            ``linear.linear`` (no bias).
        output_size: size of the ``alpha``, ``beta`` and bias vectors.
        initial_bias_value: mean of the truncated-normal bias initializer.
        weights_already_calculated: skip the two projections.
        use_highway_gate: if true, pass the result and the first input
            through ``highway_network.apply_highway_gate``.
        use_l2_loss: forwarded to ``linear.linear``.
        scope: optional variable-scope name.
        timestep: forwarded to ``linear.linear``.

    Returns:
        The integrated tensor.

    Raises:
        ValueError: if ``list_of_inputs`` does not have two entries.
    """
    with tf.variable_scope(scope or 'double_inputs_multiple_integration'):
        if len(list_of_inputs) != 2:
            raise ValueError('list of inputs must be 2, you have:', len(list_of_inputs))

        first_input = list_of_inputs[0]
        second_input = list_of_inputs[1]

        if weights_already_calculated:
            # Caller already holds the projected terms (e.g. from batch norm).
            wx_term = first_input
            uz_term = second_input
        else:
            with tf.variable_scope('Calculate_Wx_mulint'):
                wx_term = linear.linear(
                    first_input, output_size, False,
                    use_l2_loss=use_l2_loss, timestep=timestep)
            with tf.variable_scope("Calculate_Uz_mulint"):
                uz_term = linear.linear(
                    second_input, output_size, False,
                    use_l2_loss=use_l2_loss, timestep=timestep)

        with tf.variable_scope("multiplicative_integration"):
            alpha = tf.get_variable(
                'mulint_alpha', [output_size],
                initializer=tf.truncated_normal_initializer(
                    mean=1.0, stddev=0.1))

            # Both betas live in one variable and are split in half.
            betas = tf.get_variable(
                'mulint_params_betas', [output_size * 2],
                initializer=tf.truncated_normal_initializer(
                    mean=0.5, stddev=0.1))
            # Old-style positional tf.split: (split_dim, num_split, value).
            beta1, beta2 = tf.split(0, 2, betas)

            original_bias = tf.get_variable(
                'mulint_original_bias', [output_size],
                initializer=tf.truncated_normal_initializer(
                    mean=initial_bias_value, stddev=0.1))

            final_output = (alpha * wx_term * uz_term + beta1 * uz_term
                            + beta2 * wx_term + original_bias)

        if use_highway_gate:
            final_output = highway_network.apply_highway_gate(
                final_output, first_input)
    return final_output
def __call__(self, inputs, state, timestep=0, scope=None):
    """Advance the single-gate recurrent cell by one step.

    A sigmoid gate computed from ``[inputs, state]`` interpolates between
    the previous state and a tanh candidate computed from the inputs only.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        A ``(new_h, new_h)`` pair (output and next state are the same).
    """
    with tf.variable_scope(scope or type(self).__name__):
        with tf.variable_scope("Gates"):
            # Forget Gate bias starts as 1.0 -- TODO: double check if this
            # is correct
            if self.use_multiplicative_integration:
                gate_logits = multiplicative_integration(
                    [inputs, state],
                    self._num_units,
                    self.forget_bias_initialization)
            else:
                gate_logits = linear(
                    [inputs, state],
                    self._num_units,
                    True,
                    self.forget_bias_initialization)
            gated_factor = tf.sigmoid(gate_logits)

        with tf.variable_scope("Candidate"):
            candidate = tf.tanh(linear([inputs], self._num_units, True, 0.0))
            # Recurrent dropout touches the candidate only, and only when
            # both flags are set.
            input_contribution = candidate
            if self.use_recurrent_dropout and self.is_training:
                input_contribution = tf.nn.dropout(
                    candidate, self.recurrent_dropout_factor)

        retained = (1 - gated_factor) * state
        admitted = gated_factor * input_contribution
        new_h = retained + admitted
    return new_h, new_h
def __call__(self, inputs, state, timestep=0, scope=None):
    """Run the stacked highway transition for one step.

    For each of ``self.num_highway_layers`` layers a tanh "highway factor"
    and a sigmoid gate are computed and used to mix the factor with the
    running state. Layers that see the inputs use
    ``multiplicative_integration``; the others apply ``linear`` to the
    running state alone.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: accepted for interface compatibility; not read here.

    Returns:
        A ``(state, state)`` pair holding the state after the last layer.
    """
    running_state = state
    for depth in xrange(self.num_highway_layers):
        # The first layer always sees the inputs; later ones only if asked.
        sees_inputs = self.use_inputs_on_each_layer or depth == 0

        with tf.variable_scope('highway_factor_' + str(depth)):
            if sees_inputs:
                factor_logits = multiplicative_integration(
                    [inputs, running_state], self._num_units)
            else:
                factor_logits = linear(
                    [running_state], self._num_units, True)
            highway_factor = tf.tanh(factor_logits)

        with tf.variable_scope('gate_for_highway_factor_' + str(depth)):
            # -3.0 is the initial gate bias value handed to the helpers.
            if sees_inputs:
                gate_logits = multiplicative_integration(
                    [inputs, running_state],
                    self._num_units,
                    initial_bias_value=-3.0)
            else:
                gate_logits = linear(
                    [running_state], self._num_units, True, -3.0)
            transform_gate = tf.sigmoid(gate_logits)
            carry_gate = 1 - transform_gate

        if self.use_recurrent_dropout and self.is_training:
            highway_factor = tf.nn.dropout(
                highway_factor, self.recurrent_dropout_factor)

        running_state = (highway_factor * transform_gate
                         + running_state * carry_gate)
    return running_state, running_state
def __call__(self, inputs, state, timestep=0, scope=None):
    """Run the stacked highway transition (plain linear layers) for one step.

    Each of ``self.num_highway_layers`` layers computes a tanh "highway
    factor" and a sigmoid gate with ``linear``, then mixes the factor with
    the running state using the gate and its complement.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: accepted for interface compatibility; not read here.

    Returns:
        A ``(state, state)`` pair holding the state after the last layer.
    """
    running_state = state
    for depth in xrange(self.num_highway_layers):
        # The first layer always sees the inputs; later ones only if asked.
        if self.use_inputs_on_each_layer or depth == 0:
            layer_inputs = [inputs, running_state]
        else:
            layer_inputs = [running_state]

        with tf.variable_scope('highway_factor_' + str(depth)):
            highway_factor = tf.tanh(
                linear(layer_inputs, self._num_units, True))

        with tf.variable_scope('gate_for_highway_factor_' + str(depth)):
            # -3.0 is passed as linear's fourth argument (presumably the
            # initial bias value).
            transform_gate = tf.sigmoid(
                linear(layer_inputs, self._num_units, True, -3.0))
            carry_gate = 1.0 - transform_gate

        running_state = (highway_factor * transform_gate
                         + running_state * carry_gate)
    return running_state, running_state
def _inner_function(self, inputs, past_hidden_state, activation=tf.nn.tanh):
    """Compute ``z_t``, the second-order inner function of the delta RNN.

    The original author cites equation 11 of the delta RNN paper.

    Args:
        inputs: input tensor for the current step.
        past_hidden_state: hidden state tensor from the previous step.
        activation: callable applied to the summed terms.

    Returns:
        ``activation(alpha*V*W + beta_one*V + beta_two*W + z_t_bias)``
        where ``V`` and ``W`` are the projections of the previous state
        and of the inputs.

    Side effects:
        Stores the input projection on ``self._W_x_inputs`` so that
        ``_outer_function`` can reuse it.
    """
    units = self._num_units
    state_projection = linear(past_hidden_state, units, True)

    # We make this a private variable to be reused in the _outer_function
    self._W_x_inputs = linear(inputs, units, True)
    input_projection = self._W_x_inputs

    alpha = tf.get_variable(
        "alpha", [units], dtype=tf.float32,
        initializer=tf.ones_initializer())
    beta_one = tf.get_variable(
        "beta_one", [units], dtype=tf.float32,
        initializer=tf.ones_initializer())
    beta_two = tf.get_variable(
        "beta_two", [units], dtype=tf.float32,
        initializer=tf.ones_initializer())
    z_t_bias = tf.get_variable(
        "z_t_bias", [units], dtype=tf.float32,
        initializer=tf.zeros_initializer())

    # Second Order Cell Calculations
    multiplicative_term = alpha * state_projection * input_projection
    additive_term = (beta_one * state_projection
                     + beta_two * input_projection)
    return activation(multiplicative_term + additive_term + z_t_bias)
def __call__(self, inputs, state, scope=None):
    """Advance the JZS3 cell by one step on its configured GPU.

    NOTE(review): the original string read "JZS3, mutant 2"; the update
    below looks like MUT3 from Jozefowicz et al. (2015) -- confirm.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        A ``(h_t, h_t)`` pair; there is only one hidden state to track.
    """
    device_name = "/gpu:" + str(self._gpu_for_layer)
    with tf.device(device_name):
        with tf.variable_scope(scope or type(self).__name__):
            with tf.variable_scope("Zinput"):
                # Reset gate and update gate.
                # We start with bias of 1.0 to not reset and not update.
                # Equation 1: z from the inputs and tanh(state).
                z_logits = linear(
                    [inputs, tf.tanh(state)],
                    self._num_units,
                    True,
                    1.0,
                    weight_initializer=self._weight_initializer,
                    orthogonal_scale_factor=self._orthogonal_scale_factor)
                z = tf.sigmoid(z_logits)

            with tf.variable_scope("Rinput"):
                # Equation 2: r from the inputs and the raw state.
                r_logits = linear(
                    [inputs, state],
                    self._num_units,
                    True,
                    1.0,
                    weight_initializer=self._weight_initializer,
                    orthogonal_scale_factor=self._orthogonal_scale_factor)
                r = tf.sigmoid(r_logits)

            with tf.variable_scope("Candidate"):
                # Equation 3: candidate gated by z, old state by (1 - z).
                candidate_logits = linear(
                    [state * r, inputs], self._num_units, True)
                gated_candidate = tf.tanh(candidate_logits) * z
                kept_state = state * (1 - z)

            h_t = gated_candidate + kept_state
        # there is only one hidden state output to keep track of.
        return h_t, h_t
def __call__(self, inputs, state, timestep=0, scope=None):
    """Long short-term memory cell with layer norm and multiplicative integration.

    ``state`` is split in two along axis 1 into ``h`` and ``c``. Inputs
    and ``h`` are projected to four gate-sized chunks each, layer
    normalised, merged by ``multiplicative_integration`` and split into
    the ``i``, ``j``, ``f`` and ``o`` terms.

    Args:
        inputs: input tensor for the current step.
        state: previous state, ``h`` and ``c`` concatenated along axis 1.
        timestep: accepted for interface compatibility; not read here.
            (The original note floats the idea of separate normalisation
            statistics for timesteps above 10.)
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` is
        ``tf.concat([new_h, new_c], 1)``.
    """
    # NOTE(review): the source arrived with its line structure collapsed;
    # scope nesting was reconstructed -- confirm against checkpoints.
    with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # tf.split is called in keyword form so it agrees with the
        # tf.concat(values, axis) call used for the returned state; the
        # two calls previously targeted incompatible TensorFlow APIs.
        h, c = tf.split(value=state, num_or_size_splits=2, axis=1)

        # Parameters of gates are concatenated into one multiply for
        # efficiency. Bias is disabled on both projections because the
        # normalisation bias is added later.
        with tf.variable_scope('inputs_weight_matrix'):
            inputs_concat = linear([inputs], 4 * self._num_units, False)
            inputs_concat = layer_norm(
                inputs_concat,
                num_variables_in_tensor=4,
                scope="inputs_concat_layer_norm")

        with tf.variable_scope('state_weight_matrix'):
            h_concat = linear([h], 4 * self._num_units, False)
            h_concat = layer_norm(
                h_concat,
                num_variables_in_tensor=4,
                scope="h_concat_layer_norm")

        merged = multiplicative_integration(
            [inputs_concat, h_concat],
            4 * self._num_units,
            0.0,
            weights_already_calculated=True)
        i, j, f, o = tf.split(value=merged, num_or_size_splits=4, axis=1)

        new_c = (c * tf.sigmoid(f + self._forget_bias)
                 + tf.sigmoid(i) * tf.tanh(j))

        # Apply layer norm to the hidden state transition.
        with tf.variable_scope('layer_norm_hidden_state'):
            new_h = tf.tanh(layer_norm(new_c)) * tf.sigmoid(o)

    # State is packed h first, then c (the author notes this is reversed).
    return new_h, tf.concat([new_h, new_c], 1)
def __call__(self, inputs, state, scope=None):
    """Advance the JZS1 (mutant 1) cell by one step on its configured GPU.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        A ``(h_t, h_t)`` pair; there is only one hidden state to track.
    """
    device_name = "/gpu:" + str(self._gpu_for_layer)
    with tf.device(device_name):
        with tf.variable_scope(scope or type(self).__name__):  # "JZS1Cell"
            with tf.variable_scope("Zinput"):
                # Reset gate and update gate.
                # We start with bias of 1.0 to not reset and not update.
                # Equation 1: z = sigm(WxzXt+Bz), x_t is inputs
                z_logits = linear(
                    [inputs],
                    self._num_units,
                    True,
                    1.0,
                    weight_initializer=self._weight_initializer,
                    orthogonal_scale_factor=self._orthogonal_scale_factor)
                z = tf.sigmoid(z_logits)

            with tf.variable_scope("Rinput"):
                # Equation 2: r = sigm(WxrXt+Whrht+Br), h_t is the
                # previous state
                r_logits = linear(
                    [inputs, state],
                    self._num_units,
                    True,
                    1.0,
                    weight_initializer=self._weight_initializer,
                    orthogonal_scale_factor=self._orthogonal_scale_factor)
                r = tf.sigmoid(r_logits)

            with tf.variable_scope("Candidate"):
                # Equation 3: the candidate adds tanh(inputs) to the
                # projected, reset-gated state.
                gated_state_projection = linear(
                    [r * state], self._num_units, True)
                candidate = tf.tanh(
                    tf.tanh(inputs) + gated_state_projection)
                gated_candidate = candidate * z
                kept_state = state * (1 - z)

            h_t = gated_candidate + kept_state
        # there is only one hidden state output to keep track of.
        return h_t, h_t
def __call__(self, inputs, state, timestep=0, scope=None):
    """Long short-term memory cell with layer norm and multiplicative integration.

    ``state`` is split in two along axis 1 into ``h`` and ``c``. Inputs
    and ``h`` are projected to four gate-sized chunks each, layer
    normalised, merged by ``multiplicative_integration`` and split into
    the ``i``, ``j``, ``f`` and ``o`` terms.

    Args:
        inputs: input tensor for the current step.
        state: previous state, ``h`` and ``c`` concatenated along axis 1.
        timestep: accepted for interface compatibility; not read here.
            (The original note floats the idea of separate normalisation
            statistics for timesteps above 10.)
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` concatenates
        ``[new_h, new_c]`` along axis 1.
    """
    # NOTE(review): the source arrived with its line structure collapsed;
    # scope nesting was reconstructed -- confirm against checkpoints.
    gate_width = 4 * self._num_units
    with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Old-style positional tf.split: (split_dim, num_split, value).
        h, c = tf.split(1, 2, state)

        # Parameters of gates are concatenated into one multiply for
        # efficiency. Bias is disabled on both projections because the
        # normalisation bias is added later.
        with tf.variable_scope('inputs_weight_matrix'):
            inputs_concat = linear([inputs], gate_width, False)
            inputs_concat = layer_norm(
                inputs_concat,
                num_variables_in_tensor=4,
                scope="inputs_concat_layer_norm")

        with tf.variable_scope('state_weight_matrix'):
            h_concat = linear([h], gate_width, False)
            h_concat = layer_norm(
                h_concat,
                num_variables_in_tensor=4,
                scope="h_concat_layer_norm")

        merged = multiplicative_integration(
            [inputs_concat, h_concat],
            4 * self._num_units,
            0.0,
            weights_already_calculated=True)
        i, j, f, o = tf.split(1, 4, merged)

        kept_memory = c * tf.sigmoid(f + self._forget_bias)
        new_memory = tf.sigmoid(i) * tf.tanh(j)
        new_c = kept_memory + new_memory

        # Apply layer norm to the hidden state transition.
        with tf.variable_scope('layer_norm_hidden_state'):
            squashed = tf.tanh(layer_norm(new_c))
            new_h = squashed * tf.sigmoid(o)

    # State is packed h first, then c (the author notes this is reversed).
    return new_h, tf.concat(1, [new_h, new_c])
def multiplicative_integration(list_of_inputs,
                               output_size,
                               bias_start=0.0,
                               weights_already_calculated=False,
                               use_highway_gate=False,
                               use_l2_loss=False,
                               scope=None,
                               timestep=0):
    """Combine two inputs as ``alpha*Wx*Uz + beta1*Uz + beta2*Wx + bias``.

    Args:
        list_of_inputs: exactly two entries. When
            ``weights_already_calculated`` is true they are used directly
            as ``Wx`` and ``Uz`` (useful for batch-normed inputs);
            otherwise each is projected to ``output_size`` with ``linear``
            (no bias).
        output_size: size of the ``alpha``, ``beta`` and bias vectors.
        bias_start: mean of the truncated-normal bias initializer.
        weights_already_calculated: skip the two projections.
        use_highway_gate: if true, pass the result and the first input
            through ``highway_network.apply_highway_gate``.
        use_l2_loss: forwarded to ``linear``.
        scope: optional variable-scope name.
        timestep: forwarded to ``linear``.

    Returns:
        The integrated tensor.

    Raises:
        ValueError: if ``list_of_inputs`` does not have two entries.
    """
    with tf.variable_scope(scope or 'double_inputs_multiple_integration'):
        if len(list_of_inputs) != 2:
            raise ValueError('list of inputs must be 2, you have:',
                             len(list_of_inputs))

        first_input = list_of_inputs[0]
        second_input = list_of_inputs[1]

        if not weights_already_calculated:
            with tf.variable_scope('Calculate_Wx_mulint'):
                wx_term = linear(
                    first_input, output_size, False,
                    use_l2_loss=use_l2_loss, timestep=timestep)
            with tf.variable_scope("Calculate_Uz_mulint"):
                uz_term = linear(
                    second_input, output_size, False,
                    use_l2_loss=use_l2_loss, timestep=timestep)
        else:
            # Caller already holds the projected terms (e.g. from batch norm).
            wx_term = first_input
            uz_term = second_input

        with tf.variable_scope("multiplicative_integration"):
            alpha_init = tf.truncated_normal_initializer(mean=1.0, stddev=0.1)
            alpha = tf.get_variable(
                'mulint_alpha', [output_size], initializer=alpha_init)

            # Both betas live in one variable and are split in half.
            betas_init = tf.truncated_normal_initializer(mean=0.5, stddev=0.1)
            betas = tf.get_variable(
                'mulint_params_betas', [output_size * 2],
                initializer=betas_init)
            beta1, beta2 = tf.split(axis=0, num_or_size_splits=2, value=betas)

            bias_init = tf.truncated_normal_initializer(
                mean=bias_start, stddev=0.1)
            original_bias = tf.get_variable(
                'mulint_original_bias', [output_size], initializer=bias_init)

            final_output = (alpha * wx_term * uz_term + beta1 * uz_term
                            + beta2 * wx_term + original_bias)

        if use_highway_gate:
            final_output = highway_network.apply_highway_gate(
                final_output, first_input)
    return final_output
def __call__(self, inputs, state, timestep=0, scope=None):
    """Long short-term memory cell (LSTM) with layer norm, pinned to a GPU.

    Args:
        inputs: input tensor for the current step.
        state: previous state, ``h`` and ``c`` concatenated along axis 1.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` concatenates
        ``[new_h, new_c]`` along axis 1.
    """
    device_name = "/gpu:" + str(self._gpu_for_layer)
    with tf.device(device_name):
        with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
            # Old-style positional tf.split: (split_dim, num_split, value).
            h, c = tf.split(1, 2, state)

            # Parameters of gates are concatenated into one multiply for
            # efficiency.
            gate_logits = linear([inputs, h], self._num_units * 4, False, 0.0)
            gate_logits = layer_norm(gate_logits, num_variables_in_tensor=4)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = tf.split(1, 4, gate_logits)

            # Recurrent dropout touches only the candidate input.
            input_contribution = tf.tanh(j)
            if self.use_recurrent_dropout and self.is_training:
                input_contribution = tf.nn.dropout(
                    input_contribution, self.recurrent_dropout_factor)

            kept_memory = c * tf.sigmoid(f + self._forget_bias)
            new_memory = tf.sigmoid(i) * input_contribution
            new_c = kept_memory + new_memory

            with tf.variable_scope('new_h_output'):
                squashed = tf.tanh(layer_norm(new_c))
                new_h = squashed * tf.sigmoid(o)

        # purposely reversed: h comes first, then c
        return new_h, tf.concat(1, [new_h, new_c])
def __call__(self, inputs, state, timestep=0, scope=None):
    """Run the stacked highway transition (plain linear layers) for one step.

    Each of ``self.num_highway_layers`` layers computes a tanh "highway
    factor" and a sigmoid gate with ``linear``, then mixes the factor with
    the running state using the gate and its complement.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: accepted for interface compatibility; not read here.

    Returns:
        A ``(state, state)`` pair holding the state after the last layer.
    """
    current_state = state
    for highway_layer in xrange(self.num_highway_layers):
        # The first layer always sees the inputs; later ones only if asked.
        if self.use_inputs_on_each_layer or highway_layer == 0:
            layer_inputs = [inputs, current_state]
        else:
            layer_inputs = [current_state]

        with tf.variable_scope('highway_factor_' + str(highway_layer)):
            highway_factor = tf.tanh(
                linear(layer_inputs, self._num_units, True))

        with tf.variable_scope('gate_for_highway_factor_' + str(highway_layer)):
            # -3.0 is passed as linear's fourth argument (presumably the
            # initial bias value).
            gate_for_highway_factor = tf.sigmoid(
                linear(layer_inputs, self._num_units, True, -3.0))
            # The carry gate must be defined before the mix below; it is
            # the complement of the transform gate, so the two mixing
            # weights sum to one.
            gate_for_hidden_factor = 1.0 - gate_for_highway_factor

        current_state = (highway_factor * gate_for_highway_factor
                         + current_state * gate_for_hidden_factor)
    return current_state, current_state
def __call__(self, inputs, state, scope=None):
    """Gated recurrent unit (GRU) step with layer normalisation.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        scope: optional variable-scope name; the class name is used when
            it is falsy.

    Returns:
        A ``(new_h, new_h)`` pair: the updated hidden state is returned
        both as the output and as the next state.
    """
    units = self._num_units
    with tf.variable_scope(scope or type(self).__name__):  # "GRUCell"
        with tf.variable_scope("Gates"):
            # Reset gate and update gate.
            # We start with bias of 1.0 to not reset and not update.
            joint_projection = linear([inputs, state], 2 * units, False, 1.0)
            joint_logits = layer_norm(
                joint_projection,
                num_variables_in_tensor=2,
                initial_bias_value=1.0)
            # Old-style positional tf.split: (split_dim, num_split, value).
            reset_gate, update_gate = tf.split(1, 2, tf.sigmoid(joint_logits))

        with tf.variable_scope("Candidate"):
            with tf.variable_scope("reset_portion"):
                normed_state = layer_norm(linear([state], units, False))
                reset_portion = reset_gate * normed_state
            with tf.variable_scope("inputs_portion"):
                inputs_portion = layer_norm(linear([inputs], units, False))
            candidate = tf.tanh(reset_portion + inputs_portion)

        # Blend the previous state with the candidate via the update gate.
        carried = update_gate * state
        written = (1 - update_gate) * candidate
        new_h = carried + written
    return new_h, new_h
def __call__(self, inputs, state, timestep=0, scope=None):
    """Advance an LSTM with several memory arrays by one step.

    ``state`` is split along axis 1 into ``num_memory_arrays + 1`` pieces:
    the hidden state followed by one cell state per memory array. One
    large projection produces the ``i``, ``j``, ``f``, ``o`` terms for
    every array; the per-array hidden outputs are summed.

    Args:
        inputs: input tensor for the current step.
        state: previous state, ``[h, c_0, ..., c_{n-1}]`` concatenated
            along axis 1.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` concatenates ``new_h``
        and every new cell state along axis 1.
    """
    array_count = self.num_memory_arrays
    with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Old-style positional tf.split: (split_dim, num_split, value).
        state_pieces = tf.split(1, array_count + 1, state)
        h = state_pieces[0]
        c_list = state_pieces[1:]

        # Very large matrix multiplication to speed up the procedure --
        # the individual gate terms are split out below.
        projection_width = self._num_units * 4 * self.num_memory_arrays
        if self.use_multiplicative_integration:
            concat = multiplicative_integration(
                [inputs, h], projection_width, 0.0)
        else:
            concat = linear([inputs, h], projection_width, True)

        if self.use_layer_normalization:
            concat = layer_norm(
                concat, num_variables_in_tensor=4 * self.num_memory_arrays)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        # -- comes in sets of fours
        all_vars_list = tf.split(1, 4 * self.num_memory_arrays, concat)

        new_c_list = []
        new_h_list = []
        for array_counter in xrange(self.num_memory_arrays):
            base = array_counter * 4
            i = all_vars_list[base]
            j = all_vars_list[base + 1]
            f = all_vars_list[base + 2]
            o = all_vars_list[base + 3]

            # Recurrent dropout touches only the candidate input.
            input_contribution = tf.tanh(j)
            if self.use_recurrent_dropout and self.is_training:
                input_contribution = tf.nn.dropout(
                    input_contribution, self.recurrent_dropout_factor)

            kept_memory = c_list[array_counter] * tf.sigmoid(
                f + self._forget_bias)
            new_memory = tf.sigmoid(i) * input_contribution
            new_c_list.append(kept_memory + new_memory)

            # The stored cell state stays un-normalised; only the copy
            # feeding the hidden output is layer-normalised.
            new_c = new_c_list[-1]
            if self.use_layer_normalization:
                new_c = layer_norm(new_c)

            new_h_list.append(tf.tanh(new_c) * tf.sigmoid(o))

        # Sum all new_h components -- note there is no division by
        # num_memory_arrays.
        new_h = tf.add_n(new_h_list)

    # purposely reversed: h comes first, then the cell states
    return new_h, tf.concat(1, [new_h] + new_c_list)
def __call__(self, inputs, state, timestep=0, scope=None):
    """Advance the single-gate recurrent cell by one step.

    A sigmoid gate computed from ``[inputs, state]`` interpolates between
    the previous state and a tanh candidate computed from the inputs only.

    Args:
        inputs: input tensor for the current step.
        state: hidden state tensor from the previous step.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        A ``(new_h, new_h)`` pair (output and next state are the same).
    """
    units = self._num_units
    with tf.variable_scope(scope or type(self).__name__):
        with tf.variable_scope("Gates"):
            # Forget Gate bias starts as 1.0 -- TODO: double check if this
            # is correct
            if self.use_multiplicative_integration:
                gate_logits = multiplicative_integration(
                    [inputs, state], units, self.forget_bias_initialization)
            else:
                gate_logits = linear(
                    [inputs, state], units, True,
                    self.forget_bias_initialization)
            gated_factor = tf.sigmoid(gate_logits)

        with tf.variable_scope("Candidate"):
            candidate_logits = linear([inputs], units, True, 0.0)
            candidate = tf.tanh(candidate_logits)
            # Recurrent dropout is applied only when both flags are set.
            wants_dropout = self.use_recurrent_dropout and self.is_training
            if wants_dropout:
                input_contribution = tf.nn.dropout(
                    candidate, self.recurrent_dropout_factor)
            else:
                input_contribution = candidate

        keep_weight = 1 - gated_factor
        new_h = keep_weight * state + gated_factor * input_contribution
    return new_h, new_h
def __call__(self, inputs, state, timestep=0, scope=None):
    """Advance an LSTM with several memory arrays by one step.

    ``state`` is split along axis 1 into ``num_memory_arrays + 1`` pieces:
    the hidden state followed by one cell state per memory array. One
    large projection produces the ``i``, ``j``, ``f``, ``o`` terms for
    every array; the per-array hidden outputs are summed.

    Args:
        inputs: input tensor for the current step.
        state: previous state, ``[h, c_0, ..., c_{n-1}]`` concatenated
            along axis 1.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` concatenates ``new_h``
        and every new cell state along axis 1.
    """
    with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
        # Old-style positional tf.split: (split_dim, num_split, value).
        hidden_and_cells = tf.split(1, self.num_memory_arrays + 1, state)
        h, c_list = hidden_and_cells[0], hidden_and_cells[1:]

        # Very large matrix multiplication to speed up the procedure --
        # the individual gate terms are split out below.
        total_width = self._num_units * 4 * self.num_memory_arrays
        if self.use_multiplicative_integration:
            concat = multiplicative_integration([inputs, h], total_width, 0.0)
        else:
            concat = linear([inputs, h], total_width, True)

        if self.use_layer_normalization:
            concat = layer_norm(
                concat, num_variables_in_tensor=4 * self.num_memory_arrays)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        # -- comes in sets of fours
        all_vars_list = tf.split(1, 4 * self.num_memory_arrays, concat)

        # Memory array loop.
        new_c_list, new_h_list = [], []
        for array_counter in xrange(self.num_memory_arrays):
            offset = 4 * array_counter
            i = all_vars_list[offset + 0]
            j = all_vars_list[offset + 1]
            f = all_vars_list[offset + 2]
            o = all_vars_list[offset + 3]

            # Recurrent dropout touches only the candidate input.
            squashed_input = tf.tanh(j)
            if self.use_recurrent_dropout and self.is_training:
                input_contribution = tf.nn.dropout(
                    squashed_input, self.recurrent_dropout_factor)
            else:
                input_contribution = squashed_input

            previous_c = c_list[array_counter]
            updated_c = (previous_c * tf.sigmoid(f + self._forget_bias)
                         + tf.sigmoid(i) * input_contribution)
            new_c_list.append(updated_c)

            # The stored cell state stays un-normalised; only the copy
            # feeding the hidden output is layer-normalised.
            if self.use_layer_normalization:
                new_c = layer_norm(updated_c)
            else:
                new_c = updated_c

            new_h_list.append(tf.tanh(new_c) * tf.sigmoid(o))

        # Sum all new_h components -- could instead do a mean -- but
        # investigate that later.
        new_h = tf.add_n(new_h_list)

    # purposely reversed: h comes first, then the cell states
    return new_h, tf.concat(1, [new_h] + new_c_list)
def __call__(self, inputs, state, timestep=0, scope=None):
    """Long short-term memory cell (LSTM) with layer norm, pinned to a GPU.

    Args:
        inputs: input tensor for the current step.
        state: previous state, ``h`` and ``c`` concatenated along axis 1.
        timestep: accepted for interface compatibility; not read here.
        scope: optional variable-scope name; defaults to the class name.

    Returns:
        ``(new_h, new_state)`` where ``new_state`` concatenates
        ``[new_h, new_c]`` along axis 1.
    """
    with tf.device("/gpu:" + str(self._gpu_for_layer)):
        with tf.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
            # Old-style positional tf.split: (split_dim, num_split, value).
            h, c = tf.split(1, 2, state)

            # Parameters of gates are concatenated into one multiply for
            # efficiency.
            projected = linear([inputs, h], self._num_units * 4, False, 0.0)
            normalised = layer_norm(projected, num_variables_in_tensor=4)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            i, j, f, o = tf.split(1, 4, normalised)

            # Recurrent dropout touches only the candidate input.
            squashed_input = tf.tanh(j)
            if self.use_recurrent_dropout and self.is_training:
                input_contribution = tf.nn.dropout(
                    squashed_input, self.recurrent_dropout_factor)
            else:
                input_contribution = squashed_input

            forget_gate = tf.sigmoid(f + self._forget_bias)
            retained = c * forget_gate
            input_gate = tf.sigmoid(i)
            new_c = retained + input_gate * input_contribution

            with tf.variable_scope('new_h_output'):
                normed_c = layer_norm(new_c)
                new_h = tf.tanh(normed_c) * tf.sigmoid(o)

        # purposely reversed: h comes first, then c
        return new_h, tf.concat(1, [new_h, new_c])