def __call__(self, inputs, state, timestep=0, scope=None): """Normal Gated recurrent unit (GRU) with nunits cells.""" with tf.variable_scope(scope or type(self).__name__): # "GRUCell" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not udpate. r, u = tf.split( 1, 2, tf.sigmoid( multiplicative_integration([inputs, state], self._num_units * 2, 1.0))) with tf.variable_scope( "Candidate" ): # you need a different one because you're doing a new linear # notice they have the activation/non-linear step right here! c = tf.tanh( multiplicative_integration([inputs, state], self._num_units, 0.0)) if self.use_recurrent_dropout and self.is_training: input_contribution = tf.nn.dropout( c, self.recurrent_dropout_factor) else: input_contribution = c new_h = u * state + (1 - u) * input_contribution return new_h, new_h
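# Illustrative numpy sketch (not part of the original cell): one MI-GRU step
# on a single example, assuming multiplicative_integration reduces to
# alpha * Wx * Uh + beta1 * Uh + beta2 * Wx + b (see its definition below).
# All weight names here are hypothetical.
import numpy as np

def mi_gru_step_np(x, h, Wg, Ug, bg, Wc, Uc, bc,
                   alpha=1.0, beta1=0.5, beta2=0.5):
    """x: [d_in], h: [d_hid]; Wg: [d_in, 2*d_hid], Ug: [d_hid, 2*d_hid]."""
    mi = lambda wx, uh, b: alpha * wx * uh + beta1 * uh + beta2 * wx + b
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    r, u = np.split(sigmoid(mi(np.dot(x, Wg), np.dot(h, Ug), bg)), 2)
    c = np.tanh(mi(np.dot(x, Wc), np.dot(r * h, Uc), bc))  # reset gate applied to h
    return u * h + (1.0 - u) * c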
def __call__(self, inputs, state, timestep=0, scope=None): with tf.variable_scope(scope or type(self).__name__): with tf.variable_scope( "Gates" ): # Forget Gate bias starts as 1.0 -- TODO: double check if this is correct if self.use_multiplicative_integration: gated_factor = multiplicative_integration( [inputs, state], self._num_units, self.forget_bias_initialization) else: gated_factor = linear([inputs, state], self._num_units, True, self.forget_bias_initialization) gated_factor = tf.sigmoid(gated_factor) with tf.variable_scope("Candidate"): c = tf.tanh(linear([inputs], self._num_units, True, 0.0)) if self.use_recurrent_dropout and self.is_training: input_contribution = tf.nn.dropout( c, self.recurrent_dropout_factor) else: input_contribution = c new_h = (1 - gated_factor) * state + gated_factor * input_contribution return new_h, new_h
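# Worked sketch (illustrative, hypothetical weight names): the MGU-style
# update above in numpy for a single example, using the plain-linear branch
# (use_multiplicative_integration=False). Note the candidate sees only the inputs.
import numpy as np

def mgu_step_np(x, h, Wf, Uf, bf, Wc, bc):
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    f = sigmoid(np.dot(x, Wf) + np.dot(h, Uf) + bf)  # gated factor
    c = np.tanh(np.dot(x, Wc) + bc)                  # candidate from inputs only
    return (1.0 - f) * h + f * c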
def __call__(self, inputs, state, scope=None):
    with tf.variable_scope(scope or type(self).__name__):
        unitary_hidden_state, secondary_cell_hidden_state = tf.split(1, 2, state)

        mat_in = tf.get_variable('mat_in',
                                 [self.input_size, self.state_size * 2])
        mat_out = tf.get_variable('mat_out',
                                  [self.state_size * 2, self.output_size])

        in_proj = tf.matmul(inputs, mat_in)
        # tf.complex takes the real and imaginary parts as separate arguments.
        in_proj_c = tf.complex(*tf.split(1, 2, in_proj))
        out_state = modReLU(
            in_proj_c + ulinear(unitary_hidden_state, self.state_size),
            tf.get_variable(name='bias',
                            dtype=tf.float32,
                            shape=[self.state_size],
                            initializer=tf.constant_initializer(0.)),
            scope=scope)

        with tf.variable_scope('unitary_output'):
            # Computes data linear, unitary linear and their summation
            # -- TODO: should be complex output.
            unitary_linear_output_real = linear.linear(
                [tf.real(out_state), tf.imag(out_state), inputs],
                self.output_size, True, 0.0)

        with tf.variable_scope('scale_nonlinearity'):
            hidden_bias = tf.get_variable(
                'hidden_bias', [self.output_size],
                initializer=tf.constant_initializer(0.))
            modulus = tf.complex_abs(unitary_linear_output_real)
            # NOTE: rescale is not yet consumed downstream.
            rescale = tf.maximum(modulus + hidden_bias, 0.) / (modulus + 1e-7)

        # Transition to data shortcut connection.
        out_bias = tf.get_variable('out_bias', [self.output_size],
                                   initializer=tf.constant_initializer(0.))
        out_ = tf.matmul(
            tf.concat(1, [tf.real(out_state), tf.imag(out_state)]),
            mat_out) + out_bias

    # The hidden state is complex but the output is completely real.
    return out_, out_state
def __call__(self, inputs, state, timestep=0, scope=None):
    current_state = state
    for highway_layer in xrange(self.num_highway_layers):
        with tf.variable_scope('highway_factor_' + str(highway_layer)):
            if self.use_inputs_on_each_layer or highway_layer == 0:
                highway_factor = tf.tanh(
                    linear([inputs, current_state], self._num_units, True))
            else:
                highway_factor = tf.tanh(
                    linear([current_state], self._num_units, True))

        with tf.variable_scope('gate_for_highway_factor_' + str(highway_layer)):
            if self.use_inputs_on_each_layer or highway_layer == 0:
                gate_for_highway_factor = tf.sigmoid(
                    linear([inputs, current_state], self._num_units, True,
                           -3.0))
            else:
                gate_for_highway_factor = tf.sigmoid(
                    linear([current_state], self._num_units, True, -3.0))

            gate_for_hidden_factor = 1.0 - gate_for_highway_factor

        current_state = (highway_factor * gate_for_highway_factor +
                         current_state * gate_for_hidden_factor)

    return current_state, current_state
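# Sketch (illustrative, hypothetical per-layer weights): the recurrence the
# loop above computes -- each highway layer mixes a tanh proposal with the
# carried state through a gate whose bias starts at -3.0 (mostly carry at init).
import numpy as np

def highway_rnn_step_np(x, s, layers):
    """layers: list of (W, U, b, Wg, Ug, bg) tuples, bg initialized near -3."""
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    for W, U, b, Wg, Ug, bg in layers:
        h = np.tanh(np.dot(x, W) + np.dot(s, U) + b)     # highway factor
        t = sigmoid(np.dot(x, Wg) + np.dot(s, Ug) + bg)  # transform gate
        s = t * h + (1.0 - t) * s                        # carry the rest
    return s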
def __call__(self, inputs, state, timestep=0, scope=None):
    current_state = state
    for highway_layer in xrange(self.num_highway_layers):
        with tf.variable_scope('highway_factor_' + str(highway_layer)):
            if self.use_inputs_on_each_layer or highway_layer == 0:
                highway_factor = tf.tanh(
                    multiplicative_integration([inputs, current_state],
                                               self._num_units))
            else:
                highway_factor = tf.tanh(
                    layer_norm(linear([current_state], self._num_units,
                                      True)))

        with tf.variable_scope('gate_for_highway_factor_' + str(highway_layer)):
            if self.use_inputs_on_each_layer or highway_layer == 0:
                gate_for_highway_factor = tf.sigmoid(
                    multiplicative_integration([inputs, current_state],
                                               self._num_units,
                                               initial_bias_value=-3.0))
            else:
                gate_for_highway_factor = tf.sigmoid(
                    linear([current_state], self._num_units, True, -3.0))

            gate_for_hidden_factor = 1 - gate_for_highway_factor

        if self.use_recurrent_dropout and self.is_training:
            highway_factor = tf.nn.dropout(highway_factor,
                                           self.recurrent_dropout_factor)

        current_state = (highway_factor * gate_for_highway_factor +
                         current_state * gate_for_hidden_factor)

    return current_state, current_state
def multiplicative_integration(list_of_inputs,
                               output_size,
                               initial_bias_value=0.0,
                               weights_already_calculated=False,
                               use_highway_gate=False,
                               use_l2_loss=False,
                               scope=None,
                               timestep=0):
    """Performs multiplicative integration of exactly two inputs.

    If weights_already_calculated is True, the list of inputs is treated as
    the precomputed products Wx and Uz, which is useful for batch-normed
    inputs.
    """
    with tf.variable_scope(scope or 'double_inputs_multiple_integration'):
        if len(list_of_inputs) != 2:
            raise ValueError('list_of_inputs must have length 2, got:',
                             len(list_of_inputs))

        if weights_already_calculated:
            # Weights were already applied upstream (e.g. by batch norm).
            Wx = list_of_inputs[0]
            Uz = list_of_inputs[1]
        else:
            with tf.variable_scope('Calculate_Wx_mulint'):
                Wx = linear.linear(list_of_inputs[0],
                                   output_size,
                                   False,
                                   use_l2_loss=use_l2_loss,
                                   timestep=timestep)
            with tf.variable_scope("Calculate_Uz_mulint"):
                Uz = linear.linear(list_of_inputs[1],
                                   output_size,
                                   False,
                                   use_l2_loss=use_l2_loss,
                                   timestep=timestep)

        with tf.variable_scope("multiplicative_integration"):
            alpha = tf.get_variable(
                'mulint_alpha', [output_size],
                initializer=tf.truncated_normal_initializer(mean=1.0,
                                                            stddev=0.1))
            beta1, beta2 = tf.split(
                0, 2,
                tf.get_variable(
                    'mulint_params_betas', [output_size * 2],
                    initializer=tf.truncated_normal_initializer(mean=0.5,
                                                                stddev=0.1)))
            original_bias = tf.get_variable(
                'mulint_original_bias', [output_size],
                initializer=tf.truncated_normal_initializer(
                    mean=initial_bias_value, stddev=0.1))

        final_output = (alpha * Wx * Uz + beta1 * Uz + beta2 * Wx +
                        original_bias)
        if use_highway_gate:
            final_output = highway_network.apply_highway_gate(
                final_output, list_of_inputs[0])

    return final_output
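# Quick numeric check (illustrative): with alpha == 0 and betas == 0.5, the MI
# expression alpha*Wx*Uz + beta1*Uz + beta2*Wx + b collapses to an ordinary
# (scaled) additive linear map, which is why the betas are initialized near 0.5.
import numpy as np

rng = np.random.RandomState(0)
Wx, Uz = rng.randn(5), rng.randn(5)
beta1 = beta2 = np.full(5, 0.5)
b = np.zeros(5)
additive = beta1 * Uz + beta2 * Wx + b
mi_zero_alpha = 0.0 * Wx * Uz + beta1 * Uz + beta2 * Wx + b
assert np.allclose(additive, mi_zero_alpha)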
def __call__(self, inputs, state, timestep=0, scope=None): with tf.device("/gpu:" + str(self._gpu_for_layer)): """Long short-term memory cell (LSTM).""" with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. h, c = tf.split(1, 2, state) concat = multiplicative_integration([inputs, h], self._num_units * 4, 0.0) # i = input_gate, j = new_input, f = forget_gate, o = output_gate i, j, f, o = tf.split(1, 4, concat) if self.use_recurrent_dropout and self.is_training: input_contribution = tf.nn.dropout( tf.tanh(j), self.recurrent_dropout_factor) else: input_contribution = tf.tanh(j) new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid( i) * input_contribution new_h = tf.tanh(new_c) * tf.sigmoid(o) return new_h, tf.concat(1, [new_h, new_c]) # purposely reversed
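# Note on the state layout (numpy sketch, illustrative): the cell returns
# tf.concat(1, [new_h, new_c]), so h occupies the first _num_units columns and
# tf.split(1, 2, state) recovers (h, c) in that order on the next step.
import numpy as np

batch, num_units = 2, 3
h, c = np.zeros((batch, num_units)), np.ones((batch, num_units))
state = np.concatenate([h, c], axis=1)       # what the cell emits
h_back, c_back = np.split(state, 2, axis=1)  # what __call__ recovers
assert (h_back == h).all() and (c_back == c).all()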
def __call__(self, inputs, state, scope=None):
    zero_initer = tf.constant_initializer(0.)
    with tf.variable_scope(scope or type(self).__name__):
        # These two matrices convert between real-valued inputs/outputs and
        # the complex hidden state; this could be further modified for
        # LSTM-style configurations.
        mat_in = tf.get_variable('W_in',
                                 [self.input_size, self.state_size * 2])
        mat_out = tf.get_variable('W_out',
                                  [self.state_size * 2, self.output_size])

        in_proj = tf.matmul(inputs, mat_in)
        in_proj_c = tf.complex(in_proj[:, :self.state_size],
                               in_proj[:, self.state_size:])
        out_state = modrelu_c(
            in_proj_c + ulinear_c(state, transform=self.transform),
            tf.get_variable(name='B',
                            dtype=tf.float32,
                            shape=[self.state_size],
                            initializer=zero_initer))

        out_bias = tf.get_variable(name='B_out',
                                   dtype=tf.float32,
                                   shape=[self.output_size],
                                   initializer=zero_initer)
        out = tf.matmul(
            tf.concat(1, [tf.real(out_state), tf.imag(out_state)]),
            mat_out) + out_bias

    return out, out_state
def linear(args,
           output_size,
           bias,
           bias_start=0.0,
           use_l2_loss=False,
           use_weight_normalization=use_weight_normalization_default,
           scope=None,
           timestep=-1,
           weight_initializer=None,
           orthogonal_scale_factor=1.1):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
        args: a 2D Tensor or a list of 2D, batch x n, Tensors.
        output_size: int, second dimension of W[i].
        bias: boolean, whether to add a bias term or not.
        bias_start: starting value to initialize the bias; 0 by default.
        scope: VariableScope for the created subgraph; defaults to "Linear".

    Returns:
        A 2D Tensor with shape [batch x output_size] equal to
        sum_i(args[i] * W[i]), where the W[i]s are newly created matrices.

    Raises:
        ValueError: if some of the arguments have unspecified or wrong shapes.
    """
    # assert args  # was causing an error in upgraded tensorflow
    if not isinstance(args, (list, tuple)):
        args = [args]

    if len(args) > 1 and use_weight_normalization:
        raise ValueError(
            'weight_normalization cannot be used with multiple inputs because '
            'the euclidean norm would be incorrect; consider multiplicative '
            'integration instead.')

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    for shape in shapes:
        if len(shape) != 2:
            raise ValueError("Linear is expecting 2D arguments: %s" %
                             str(shapes))
        if not shape[1]:
            raise ValueError("Linear expects shape[1] of arguments: %s" %
                             str(shapes))
        else:
            total_arg_size += shape[1]

    if use_l2_loss:
        l_regularizer = tf.contrib.layers.l2_regularizer(1e-5)
    else:
        l_regularizer = None

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable(
            "Matrix", [total_arg_size, output_size],
            initializer=tf.uniform_unit_scaling_initializer(),
            regularizer=l_regularizer)
        if use_weight_normalization:
            matrix = weight_normalization(matrix, timestep=timestep)

        if len(args) == 1:
            res = tf.matmul(args[0], matrix)
        else:
            res = tf.matmul(tf.concat(1, args), matrix)

        if not bias:
            return res

        bias_term = tf.get_variable(
            "Bias", [output_size],
            initializer=tf.constant_initializer(bias_start),
            regularizer=l_regularizer)

    return res + bias_term
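# Usage sketch (illustrative, using the TF 0.x-era API this file targets):
# a single-input projection versus a concatenated multi-input projection.
def _linear_usage_example():
    x = tf.placeholder(tf.float32, [None, 128])
    h = tf.placeholder(tf.float32, [None, 256])
    with tf.variable_scope('proj_x'):
        y1 = linear(x, 64, True)            # x @ W + b
    with tf.variable_scope('proj_xh'):
        y2 = linear([x, h], 64, True, 1.0)  # concat(x, h) @ W, bias starts at 1.0
    return y1, y2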
def __call__(self, inputs, state, timestep=0, scope=None): """Most basic RNN: output = new_state = tanh(W * input + U * state + B).""" with tf.device("/gpu:" + str(self._gpu_for_layer)): with tf.variable_scope(scope or type(self).__name__): # "BasicRNNCell" output = tf.tanh( multiplicative_integration([inputs, state], self._num_units)) return output, output
def __call__(self, inputs, state, timestep=0, scope=None): with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. hidden_state_plus_c_list = tf.split(1, self.num_memory_arrays + 1, state) h = hidden_state_plus_c_list[0] c_list = hidden_state_plus_c_list[1:] '''very large matrix multiplication to speed up procedure -- will split variables out later''' if self.use_multiplicative_integration: concat = multiplicative_integration( [inputs, h], self._num_units * 4 * self.num_memory_arrays, 0.0) else: concat = linear([inputs, h], self._num_units * 4 * self.num_memory_arrays, True) if self.use_layer_normalization: concat = layer_norm(concat, num_variables_in_tensor=4 * self.num_memory_arrays) # i = input_gate, j = new_input, f = forget_gate, o = output_gate -- comes in sets of fours all_vars_list = tf.split(1, 4 * self.num_memory_arrays, concat) '''memory array loop''' new_c_list, new_h_list = [], [] for array_counter in xrange(self.num_memory_arrays): i = all_vars_list[0 + array_counter * 4] j = all_vars_list[1 + array_counter * 4] f = all_vars_list[2 + array_counter * 4] o = all_vars_list[3 + array_counter * 4] if self.use_recurrent_dropout and self.is_training: input_contribution = tf.nn.dropout( tf.tanh(j), self.recurrent_dropout_factor) else: input_contribution = tf.tanh(j) new_c_list.append(c_list[array_counter] * tf.sigmoid(f + self._forget_bias) + tf.sigmoid(i) * input_contribution) if self.use_layer_normalization: new_c = layer_norm(new_c_list[-1]) else: new_c = new_c_list[-1] new_h_list.append(tf.tanh(new_c) * tf.sigmoid(o)) '''sum all new_h components -- could instead do a mean -- but investigate that later''' new_h = tf.add_n(new_h_list) return new_h, tf.concat(1, [new_h] + new_c_list) # purposely reversed
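# State layout sketch (illustrative): with num_memory_arrays = N the packed
# state is concat([h, c_0, ..., c_{N-1}], 1), i.e. (N + 1) * num_units columns,
# which the tf.split(1, N + 1, state) call above unpacks.
import numpy as np

num_units, n_arrays, batch = 4, 2, 3
h = np.zeros((batch, num_units))
c_list = [np.full((batch, num_units), i) for i in range(n_arrays)]
state = np.concatenate([h] + c_list, axis=1)
parts = np.split(state, n_arrays + 1, axis=1)
assert (parts[0] == h).all()
assert all((parts[i + 1] == c_list[i]).all() for i in range(n_arrays))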
def __call__(self, inputs, state, scope=None): """Gated recurrent unit (GRU) with nunits cells.""" with tf.variable_scope(scope or type(self).__name__): # "GRUCell" with tf.variable_scope("Gates"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. concated_r_u = layer_norm(linear([inputs, state], 2 * self._num_units, False, 1.0), num_variables_in_tensor=2, initial_bias_value=1.0) r, u = tf.split(1, 2, tf.sigmoid(concated_r_u)) with tf.variable_scope("Candidate"): with tf.variable_scope("reset_portion"): reset_portion = r * layer_norm(linear([state], self._num_units, False)) with tf.variable_scope("inputs_portion"): inputs_portion = layer_norm(linear([inputs], self._num_units, False)) c = tf.tanh(reset_portion + inputs_portion) new_h = u * state + (1 - u) * c return new_h, new_h
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    raw_data = reader.ptb_raw_data(FLAGS.data_path)
    train_data, valid_data, test_data, _ = raw_data

    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = PTBModel(is_training=True, config=config)
        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = PTBModel(is_training=False, config=config)
            mtest = PTBModel(is_training=False, config=eval_config)

        tf.initialize_all_variables().run()

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay**max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session,
                                         m,
                                         train_data,
                                         m.train_op,
                                         verbose=True)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
            valid_perplexity = run_epoch(session, mvalid, valid_data,
                                         tf.no_op())
            print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

        test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
        print("Test Perplexity: %.3f" % test_perplexity)
def layer_norm(input_tensor,
               num_variables_in_tensor=1,
               initial_bias_value=0.0,
               scope="layer_norm"):
    with tf.variable_scope(scope):
        '''For clarification of shapes:
        input_tensor = [batch_size, num_neurons]
        mean = [batch_size]
        variance = [batch_size]
        alpha = [num_neurons]
        bias = [num_neurons]
        output = [batch_size, num_neurons]
        '''
        input_tensor_shape_list = input_tensor.get_shape().as_list()
        num_neurons = input_tensor_shape_list[1] // num_variables_in_tensor

        alpha = tf.get_variable(
            'layer_norm_alpha', [num_neurons * num_variables_in_tensor],
            initializer=tf.constant_initializer(1.0))
        bias = tf.get_variable(
            'layer_norm_bias', [num_neurons * num_variables_in_tensor],
            initializer=tf.constant_initializer(initial_bias_value))

        if num_variables_in_tensor == 1:
            input_tensor_list = [input_tensor]
            alpha_list = [alpha]
            bias_list = [bias]
        else:
            input_tensor_list = tf.split(1, num_variables_in_tensor,
                                         input_tensor)
            alpha_list = tf.split(0, num_variables_in_tensor, alpha)
            bias_list = tf.split(0, num_variables_in_tensor, bias)

        list_of_layer_normed_results = []
        for counter in xrange(num_variables_in_tensor):
            mean, variance = moments_for_layer_norm(
                input_tensor_list[counter],
                axes=[1],
                name="moments_loopnum_" + str(counter) + scope)  # average across layer
            output = (alpha_list[counter] *
                      (input_tensor_list[counter] - mean)) / variance + bias_list[counter]
            list_of_layer_normed_results.append(output)

        if num_variables_in_tensor == 1:
            return list_of_layer_normed_results[0]
        else:
            return tf.concat(1, list_of_layer_normed_results)
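# Reference numpy version (illustrative) of the normalization above: center
# and scale each example across its neurons, then apply the learned gain and
# bias. This assumes moments_for_layer_norm returns (mean, std) rather than
# raw variance, matching the direct division above.
import numpy as np

def layer_norm_np(x, alpha, bias, eps=1e-5):
    """x: [batch, num_neurons]; alpha, bias: [num_neurons]."""
    mean = x.mean(axis=1, keepdims=True)
    std = x.std(axis=1, keepdims=True)
    return alpha * (x - mean) / (std + eps) + bias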
def batch_timesteps_linear(input,
                           output_size,
                           bias,
                           bias_start=0.0,
                           use_l2_loss=False,
                           use_weight_normalization=use_weight_normalization_default,
                           scope=None,
                           transpose_input=True,
                           timestep=-1):
    """Linear map applied at every timestep with one shared weight matrix.

    Args:
        input: a 3D Tensor [timesteps, batch_size, input_size], or
            [batch_size, timesteps, input_size] if transpose_input is True.
        output_size: int, second dimension of W.
        bias: boolean, whether to add a bias term or not.
        bias_start: starting value to initialize the bias; 0 by default.
        scope: VariableScope for the created subgraph; defaults to "Linear".

    Returns:
        A 3D Tensor with the same leading dimensions as the input and a final
        dimension of output_size.

    Raises:
        ValueError: if the input does not have rank 3.
    """
    # Calculate the total size of arguments on dimension 2.
    if transpose_input:
        input = tf.transpose(input, [1, 0, 2])

    shape_list = input.get_shape().as_list()
    if len(shape_list) != 3:
        raise ValueError('shape must be of size 3, you have inputted shape size of:',
                         len(shape_list))

    num_timesteps = shape_list[0]
    batch_size = shape_list[1]
    total_arg_size = shape_list[2]

    if use_l2_loss:
        l_regularizer = tf.contrib.layers.l2_regularizer(1e-5)
    else:
        l_regularizer = None

    # Now the computation.
    with tf.variable_scope(scope or "Linear"):
        matrix = tf.get_variable(
            "Matrix", [total_arg_size, output_size],
            initializer=tf.uniform_unit_scaling_initializer(),
            regularizer=l_regularizer)
        if use_weight_normalization:
            matrix = weight_normalization(matrix)
        matrix = tf.tile(tf.expand_dims(matrix, 0), [num_timesteps, 1, 1])

        res = tf.batch_matmul(input, matrix)

        if bias:
            bias_term = tf.get_variable(
                "Bias", [output_size],
                initializer=tf.constant_initializer(bias_start))
            res = res + bias_term

    if transpose_input:
        res = tf.transpose(res, [1, 0, 2])

    return res
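# Shape sketch (illustrative): the tiled batch matmul above is equivalent to
# applying one weight matrix independently at every timestep.
import numpy as np

batch, timesteps, d_in, d_out = 2, 5, 3, 4
x = np.random.randn(batch, timesteps, d_in)
W = np.random.randn(d_in, d_out)
y = np.einsum('bti,io->bto', x, W)  # same result as tiling W across timesteps
assert y.shape == (batch, timesteps, d_out)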
def __call__(self, inputs, state, scope=None): with tf.device("/gpu:" + str(self._gpu_for_layer)): """JZS1, mutant 1 with n units cells.""" with tf.variable_scope(scope or type(self).__name__): # "JZS1Cell" with tf.variable_scope( "Zinput"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. '''equation 1 z = sigm(WxzXt+Bz), x_t is inputs''' z = tf.sigmoid( linear([inputs], self._num_units, True, 1.0, weight_initializer=self._weight_initializer, orthogonal_scale_factor=self. _orthogonal_scale_factor)) with tf.variable_scope("Rinput"): '''equation 2 r = sigm(WxrXt+Whrht+Br), h_t is the previous state''' r = tf.sigmoid( linear([inputs, state], self._num_units, True, 1.0, weight_initializer=self._weight_initializer, orthogonal_scale_factor=self. _orthogonal_scale_factor)) '''equation 3''' with tf.variable_scope("Candidate"): component_0 = linear([r * state], self._num_units, True) component_1 = tf.tanh(tf.tanh(inputs) + component_0) component_2 = component_1 * z component_3 = state * (1 - z) h_t = component_2 + component_3 return h_t, h_t # there is only one hidden state output to keep track of.
def highway(input_,
            output_size,
            num_layers=2,
            bias=-2.0,
            activation=tf.nn.relu,
            scope=None,
            use_batch_timesteps=False,
            use_l2_loss=True,
            timestep=-1):
    """Highway Network (cf. http://arxiv.org/abs/1505.00387).

    t = sigmoid(Wy + b)
    z = t * g(Wy + b) + (1 - t) * y

    where g is a nonlinearity, t is the transform gate, and (1 - t) is the
    carry gate. If you initialize the bias to -2, the layer starts out as a
    simple pass-through.

    use_batch_timesteps requires a 3D input [batch_size x timesteps x input_size]
    and will return a tensor of the exact same dimensions.
    """
    if output_size == 'same':
        output_size = input_.get_shape()[-1]

    linear_function = (linear.batch_timesteps_linear
                       if use_batch_timesteps else linear.linear)

    with tf.variable_scope(scope or 'highway_network'):
        output = input_
        for idx in xrange(num_layers):
            original_input = output

            transform_gate = tf.sigmoid(
                linear_function(original_input,
                                output_size,
                                True,
                                bias,
                                scope='transform_lin_%d' % idx,
                                timestep=timestep))
            proposed_output = activation(
                linear_function(original_input,
                                output_size,
                                True,
                                use_l2_loss=use_l2_loss,
                                scope='proposed_output_lin_%d' % idx,
                                timestep=timestep),
                'activation_output_lin_' + str(idx))
            carry_gate = 1.0 - transform_gate

            output = (transform_gate * proposed_output +
                      carry_gate * original_input)

    return output
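# Usage sketch (illustrative, TF 0.x-era API): a two-layer highway block over
# a fixed-size representation. output_size='same' keeps the carry shapes
# aligned, and bias=-2.0 makes the block close to a pass-through at init.
def _highway_usage_example():
    word_repr = tf.placeholder(tf.float32, [None, 300])
    return highway(word_repr, 'same', num_layers=2, bias=-2.0)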
def __call__(self, inputs, state, scope=None): with tf.device("/gpu:" + str(self._gpu_for_layer)): """JZS3, mutant 2 with n units cells.""" with tf.variable_scope(scope or type(self).__name__): # "JZS1Cell" with tf.variable_scope( "Zinput"): # Reset gate and update gate. # We start with bias of 1.0 to not reset and not update. '''equation 1''' z = tf.sigmoid( linear([inputs, tf.tanh(state)], self._num_units, True, 1.0, weight_initializer=self._weight_initializer, orthogonal_scale_factor=self. _orthogonal_scale_factor)) '''equation 2''' with tf.variable_scope("Rinput"): r = tf.sigmoid( linear([inputs, state], self._num_units, True, 1.0, weight_initializer=self._weight_initializer, orthogonal_scale_factor=self. _orthogonal_scale_factor)) '''equation 3''' with tf.variable_scope("Candidate"): component_0 = linear([state * r, inputs], self._num_units, True) component_2 = (tf.tanh(component_0)) * z component_3 = state * (1 - z) h_t = component_2 + component_3 return h_t, h_t # there is only one hidden state output to keep track of.
def __call__(self, inputs, state, timestep=0, scope=None): """Long short-term memory cell (LSTM). The idea with iteration would be to run different batch norm mean and variance stats on timestep greater than 10 """ with tf.variable_scope(scope or type(self).__name__): # "BasicLSTMCell" # Parameters of gates are concatenated into one multiply for efficiency. h, c = tf.split(1, 2, state) '''note that bias is set to 0 because batch norm bias is added later''' with tf.variable_scope('inputs_weight_matrix'): inputs_concat = linear([inputs], 4 * self._num_units, False) inputs_concat = layer_norm(inputs_concat, num_variables_in_tensor=4, scope="inputs_concat_layer_norm") with tf.variable_scope('state_weight_matrix'): h_concat = linear([h], 4 * self._num_units, False) h_concat = layer_norm(h_concat, num_variables_in_tensor=4, scope="h_concat_layer_norm") i, j, f, o = tf.split( 1, 4, multiplicative_integration([inputs_concat, h_concat], 4 * self._num_units, 0.0, weights_already_calculated=True)) new_c = c * tf.sigmoid(f + self._forget_bias) + tf.sigmoid( i) * tf.tanh(j) '''apply layer norm to the hidden state transition''' with tf.variable_scope('layer_norm_hidden_state'): new_h = tf.tanh(layer_norm(new_c)) * tf.sigmoid(o) return new_h, tf.concat(1, [new_h, new_c]) # reversed this
def __call__(self, inputs, state, timestep=0, scope=None): """Normal Gated recurrent unit (GRU) with nunits cells.""" with tf.variable_scope(scope or type(self).__name__): # "GRUCell" with tf.variable_scope("Inputs"): inputs_concat = linear([inputs], self._num_units * 2, False, 1.0) inputs_concat = layer_norm(inputs_concat, num_variables_in_tensor=2, initial_bias_value=1.0) with tf.variable_scope("Hidden_State"): hidden_state_concat = linear([state], self._num_units * 2, False) hidden_state_concat = layer_norm(hidden_state_concat, num_variables_in_tensor=2) r, u = tf.split( 1, 2, tf.sigmoid( multiplicative_integration( [inputs_concat, hidden_state_concat], 2 * self._num_units, 1.0, weights_already_calculated=True))) with tf.variable_scope("Candidate"): with tf.variable_scope('input_portion'): input_portion = layer_norm( linear([inputs], self._num_units, False)) with tf.variable_scope('reset_portion'): reset_portion = r * layer_norm( linear([state], self._num_units, False)) c = tf.tanh( multiplicative_integration( [input_portion, reset_portion], self._num_units, 0.0, weights_already_calculated=True)) new_h = u * state + (1 - u) * c return new_h, new_h
def ulinear_c(vec_in_c, scope=None, transform='fourier'):
    """Multiply a complex vector by a parameterized unitary matrix.

    Equation: W = D2 R1 IT D1 Perm R0 FT D0
    """
    if not vec_in_c.dtype.is_complex:
        raise ValueError('Argument vec_in_c must be complex valued.')
    shape = vec_in_c.get_shape().as_list()
    if len(shape) != 2:
        raise ValueError(
            'Argument vec_in_c must be a batch of vectors (2D tensor).')

    if transform == 'fourier':
        fwd_trans = tf.batch_fft
        inv_trans = tf.batch_ifft
    elif transform == 'hadamard':
        fwd_trans = batch_fht
        inv_trans = batch_fht
    else:
        raise ValueError("transform must be 'fourier' or 'hadamard'.")

    in_size = shape[1]
    with tf.variable_scope(scope or 'ULinear') as _s:
        diag = [get_unit_variable_c('diag' + i, _s, [in_size]) for i in '012']
        refl = [
            normalize_c(
                get_variable_c('refl' + i, [in_size],
                               initializer=tf.random_uniform_initializer(
                                   -1., 1.))) for i in '01'
        ]
        perm0 = tf.constant(np.random.permutation(in_size),
                            name='perm0',
                            dtype='int32')

        out = vec_in_c * diag[0]
        out = refl_c(fwd_trans(out), refl[0])
        out = diag[1] * tf.transpose(tf.gather(tf.transpose(out), perm0))
        out = diag[2] * refl_c(inv_trans(out), refl[1])

        if transform == 'fourier':
            return out
        else:  # hadamard: batch_fht is unnormalized, so rescale
            return out * (1. / in_size)
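# Numeric sanity check (illustrative): every factor in
# W = D2 R1 IT D1 Perm R0 FT D0 is unitary (unit-modulus diagonals,
# Householder-style reflections, a permutation, an orthonormal FFT), so
# applying them preserves the l2 norm of a complex vector.
import numpy as np

rng = np.random.RandomState(0)
n = 8
d = np.exp(1j * rng.uniform(0, 2 * np.pi, n))  # unit-modulus diagonal
v = rng.randn(n) + 1j * rng.randn(n)
v /= np.linalg.norm(v)                         # unit reflection vector

def refl_np(x, v):
    """Householder-style reflection: (I - 2 v v*) x."""
    return x - 2 * v * np.dot(np.conj(v), x)

x = rng.randn(n) + 1j * rng.randn(n)
y = d * refl_np(np.fft.fft(x, norm='ortho'), v)
assert np.isclose(np.linalg.norm(y), np.linalg.norm(x))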
def __init__(self, is_training, config):
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Alternative cells to experiment with:
    # rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(size, forget_bias=1.0, state_is_tuple=True)
    # rnn_cell = rnn_cell_modern.HighwayRNNCell(size)
    # rnn_cell = rnn_cell_modern.JZS1Cell(size)
    # rnn_cell = rnn_cell_mulint_modern.BasicRNNCell_MulInt(size)
    # rnn_cell = rnn_cell_mulint_modern.GRUCell_MulInt(size)
    # rnn_cell = rnn_cell_mulint_modern.BasicLSTMCell_MulInt(size)
    # rnn_cell = rnn_cell_mulint_modern.HighwayRNNCell_MulInt(size)
    # rnn_cell = rnn_cell_mulint_layernorm_modern.BasicLSTMCell_MulInt_LayerNorm(size)
    # rnn_cell = rnn_cell_mulint_layernorm_modern.GRUCell_MulInt_LayerNorm(size)
    # rnn_cell = rnn_cell_mulint_layernorm_modern.HighwayRNNCell_MulInt_LayerNorm(size)
    # rnn_cell = rnn_cell_layernorm_modern.BasicLSTMCell_LayerNorm(size)
    # rnn_cell = rnn_cell_layernorm_modern.GRUCell_LayerNorm(size)
    # rnn_cell = rnn_cell_layernorm_modern.HighwayRNNCell_LayerNorm(size)
    # rnn_cell = rnn_cell_modern.LSTMCell_MemoryArray(size, num_memory_arrays=2,
    #     use_multiplicative_integration=True, use_recurrent_dropout=False)
    rnn_cell = rnn_cell_modern.MGUCell(size,
                                       use_multiplicative_integration=True,
                                       use_recurrent_dropout=False)

    if is_training and config.keep_prob < 1:
        rnn_cell = tf.nn.rnn_cell.DropoutWrapper(
            rnn_cell, output_keep_prob=config.keep_prob)
    cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell] * config.num_layers,
                                       state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
    # This builds an unrolled LSTM for tutorial purposes only.
    # In general, use the rnn() or state_saving_rnn() from rnn.py.
    #
    # The alternative version of the code below is:
    #
    # from tensorflow.models.rnn import rnn
    # inputs = [tf.squeeze(input_, [1])
    #           for input_ in tf.split(1, num_steps, inputs)]
    # outputs, state = rnn.rnn(cell, inputs, initial_state=self._initial_state)
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    output = tf.reshape(tf.concat(1, outputs), [-1, size])

    softmax_w = tf.transpose(embedding)  # weight tying
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = tf.nn.seq2seq.sequence_loss_by_example(
        [logits], [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps])])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)

    # optimizer = tf.train.GradientDescentOptimizer(self.lr)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
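# Note on the weight tying above (numpy sketch, illustrative): reusing the
# transposed embedding as the softmax matrix halves the vocab-sized parameter
# count; logits come out as [batch * num_steps, vocab_size].
import numpy as np

vocab, size, tokens = 10, 4, 6
embedding = np.random.randn(vocab, size)
output = np.random.randn(tokens, size)  # flattened RNN outputs
logits = np.dot(output, embedding.T)    # tied softmax weights
assert logits.shape == (tokens, vocab)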