def _CreateModel(self, rnn_mode, num_layers, num_units, input_size,
                 input_mode="linear_input", dropout=0.):
    """Instantiate the cuDNN RNN wrapper that corresponds to ``rnn_mode``.

    Args:
        rnn_mode: one of the ``cudnn_rnn_ops.CUDNN_*`` mode constants
            (LSTM, GRU, RNN_TANH, RNN_RELU).
        num_layers: forwarded positionally to the model constructor.
        num_units: forwarded positionally to the model constructor.
        input_size: forwarded positionally to the model constructor.
        input_mode: accepted for signature compatibility; it is NOT
            forwarded to the constructor.
        dropout: forwarded as the ``dropout`` keyword argument.

    Returns:
        The newly constructed model object.

    Raises:
        ValueError: if ``rnn_mode`` matches none of the known constants.
    """
    ctor_args = (num_layers, num_units, input_size)

    # Guard-clause dispatch: the first matching mode wins.
    if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM:
        return cudnn_rnn_ops.CudnnLSTM(*ctor_args, dropout=dropout)
    if rnn_mode == cudnn_rnn_ops.CUDNN_GRU:
        return cudnn_rnn_ops.CudnnGRU(*ctor_args, dropout=dropout)
    if rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH:
        return cudnn_rnn_ops.CudnnRNNTanh(*ctor_args, dropout=dropout)
    if rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU:
        return cudnn_rnn_ops.CudnnRNNRelu(*ctor_args, dropout=dropout)

    raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
def build(self, input_shape):
    """Create the layer's weights and a single-layer ``CudnnLSTM`` wrapper.

    Three variables are created, in this order: ``kernel`` of shape
    ``(input_dim, 4 * units)``, ``recurrent_kernel`` of shape
    ``(units, 4 * units)`` and ``bias`` of shape ``(8 * units,)``.
    Per-gate views (``*_i``, ``*_f``, ``*_c``, ``*_o``) are then exposed
    as slices of those variables.

    Args:
        input_shape: shape of the layer input, or a list of shapes, in
            which case only the first entry is used. Its last dimension
            is taken as the input feature size.
    """
    super(CuDNNLSTM, self).build(input_shape)
    if isinstance(input_shape, list):
        input_shape = input_shape[0]
    input_dim = input_shape[-1]

    from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
    units = self.units
    self._cudnn_lstm = cudnn_rnn_ops.CudnnLSTM(
        num_layers=1,
        num_units=units,
        input_size=input_dim,
        input_mode='linear_input')

    self.kernel = self.add_weight(
        shape=(input_dim, units * 4),
        name='kernel',
        initializer=self.kernel_initializer,
        regularizer=self.kernel_regularizer,
        constraint=self.kernel_constraint)
    self.recurrent_kernel = self.add_weight(
        shape=(units, units * 4),
        name='recurrent_kernel',
        initializer=self.recurrent_initializer,
        regularizer=self.recurrent_regularizer,
        constraint=self.recurrent_constraint)

    if self.unit_forget_bias:
        def bias_initializer(shape, *args, **kwargs):
            # The requested `shape` is ignored: the 8 * units bias vector
            # is assembled from three pieces so that the block at
            # [5 * units, 6 * units) -- the slice exposed as `bias_f`
            # below -- is initialised to ones.
            pieces = [
                self.bias_initializer((units * 5,), *args, **kwargs),
                initializers.Ones()((units,), *args, **kwargs),
                self.bias_initializer((units * 2,), *args, **kwargs),
            ]
            return K.concatenate(pieces)
    else:
        bias_initializer = self.bias_initializer
    self.bias = self.add_weight(
        shape=(units * 8,),
        name='bias',
        initializer=bias_initializer,
        regularizer=self.bias_regularizer,
        constraint=self.bias_constraint)

    # Column blocks of the input kernel, in i / f / c / o order.
    self.kernel_i = self.kernel[:, :units]
    self.kernel_f = self.kernel[:, units:units * 2]
    self.kernel_c = self.kernel[:, units * 2:units * 3]
    self.kernel_o = self.kernel[:, units * 3:]

    # Column blocks of the recurrent kernel, same order.
    self.recurrent_kernel_i = self.recurrent_kernel[:, :units]
    self.recurrent_kernel_f = self.recurrent_kernel[:, units:units * 2]
    self.recurrent_kernel_c = self.recurrent_kernel[:, units * 2:units * 3]
    self.recurrent_kernel_o = self.recurrent_kernel[:, units * 3:]

    # First half of the bias vector: the `*_i` biases.
    self.bias_i_i = self.bias[:units]
    self.bias_f_i = self.bias[units:units * 2]
    self.bias_c_i = self.bias[units * 2:units * 3]
    self.bias_o_i = self.bias[units * 3:units * 4]

    # Second half of the bias vector.
    self.bias_i = self.bias[units * 4:units * 5]
    self.bias_f = self.bias[units * 5:units * 6]
    self.bias_c = self.bias[units * 6:units * 7]
    self.bias_o = self.bias[units * 7:]

    self.built = True
def benchmarkCudnnLSTMTraining(self):
    """Benchmark a CudnnLSTM forward + gradient pass for every test config.

    For each entry returned by ``self._GetTestConfig()`` a fresh graph is
    built on ``/gpu:0`` with all-ones inputs, states and parameters; the
    gradients of the three model outputs with respect to the parameters
    and inputs are grouped into one op, which is handed to
    ``self._BenchmarkOp``.
    """
    test_configs = self._GetTestConfig()
    for config_name, config in test_configs.items():
        num_layers = config["num_layers"]
        num_units = config["num_units"]
        batch_size = config["batch_size"]
        seq_length = config["seq_length"]
        state_shape = [num_layers, batch_size, num_units]

        with ops.Graph().as_default(), ops.device("/gpu:0"):
            # input_size == num_units for this benchmark.
            model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, num_units)
            params_size_t = model.params_size()

            input_data = variables.Variable(
                array_ops.ones([seq_length, batch_size, num_units]))
            input_h = variables.Variable(array_ops.ones(state_shape))
            input_c = variables.Variable(array_ops.ones(state_shape))
            # The parameter count is a tensor, so the static shape of the
            # variable cannot be validated.
            params = variables.Variable(
                array_ops.ones([params_size_t]), validate_shape=False)

            output, output_h, output_c = model(
                is_training=True,
                input_data=input_data,
                input_h=input_h,
                input_c=input_c,
                params=params)

            targets = [output, output_h, output_c]
            sources = [params, input_data, input_h, input_c]
            all_grads = gradients_impl.gradients(targets, sources)
            training_op = control_flow_ops.group(*all_grads)

            label = "cudnn_lstm %s %s" % (config_name,
                                          self._GetConfigDesc(config))
            self._BenchmarkOp(training_op, label)
def _CreateModel(self, rnn_mode, num_layers, num_units, input_size,
                 input_mode="linear_input", dropout=0.):
    """Instantiate the cuDNN RNN wrapper named by the string ``rnn_mode``.

    Args:
        rnn_mode: ``"lstm"``, ``"gru"``, ``"rnn_tanh"`` or ``"rnn_relu"``.
        num_layers: forwarded positionally to the model constructor.
        num_units: forwarded positionally to the model constructor.
        input_size: forwarded positionally to the model constructor.
        input_mode: accepted for signature compatibility; it is NOT
            forwarded to the constructor.
        dropout: forwarded as the ``dropout`` keyword argument.

    Returns:
        The newly constructed model object.

    Raises:
        ValueError: if ``rnn_mode`` is not one of the four known strings.
    """
    # (mode string, attribute name on cudnn_rnn_ops). The class is looked
    # up lazily so nothing is touched on cudnn_rnn_ops for an invalid mode.
    known_modes = (
        ("lstm", "CudnnLSTM"),
        ("gru", "CudnnGRU"),
        ("rnn_tanh", "CudnnRNNTanh"),
        ("rnn_relu", "CudnnRNNRelu"),
    )
    for mode, class_name in known_modes:
        if rnn_mode == mode:
            model_cls = getattr(cudnn_rnn_ops, class_name)
            return model_cls(num_layers, num_units, input_size,
                             dropout=dropout)

    raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
def _apply_transposed(self, is_train, x, initial_states=None):
    """Run a single-layer CudnnLSTM over ``x`` and return its output sequence.

    Args:
        is_train: Python truth value; when true and ``self.dropout > 0`` a
            random multiplicative mask is applied to part of the weights.
        x: input tensor whose last dimension must be statically known.
            NOTE(review): dimension 1 is used as the batch size when
            default states are built, i.e. x is presumably time-major
            ``[time, batch, dim]`` -- confirm against callers.
        initial_states: optional ``(initial_h, initial_c)`` pair passed to
            the cell. When ``None``, zero states of shape
            ``(1, batch, self.n_out)`` are used.

    Returns:
        The first element of the cell's return value (the output sequence).

    Raises:
        ValueError: if the last dimension of ``x`` is not statically known.
    """
    w_init = TruncatedNormal(stddev=0.05)
    x_size = x.shape.as_list()[-1]
    if x_size is None:
        raise ValueError("Last dimension must be defined (have shape %s)" % str(x.shape))

    cell = cudnn_rnn_ops.CudnnLSTM(1, self.n_out, x_size, input_mode="linear_input")

    # We need to know the mapping of weights/biases -> CudnnLSTM parameter,
    # so just build a `CudnnLSTM` layer and read its fields.
    c = cudnn_layers.CudnnLSTM(1, self.n_out)
    c._input_size = x_size
    w_shapes = c.canonical_weight_shapes
    b_shapes = c.canonical_bias_shapes

    weights = [w_init(s, tf.float32) for s in w_shapes]
    biases = [tf.zeros(s, tf.float32) for s in b_shapes]
    # Two bias slots each receive half of `lstm_bias`.
    # NOTE(review): indices 1 and 5 are presumably the input-side and
    # recurrent-side forget-gate biases, so together they contribute
    # `lstm_bias` -- confirm against the canonical parameter order.
    biases[1] = tf.constant(self.lstm_bias / 2.0, tf.float32, b_shapes[1])
    biases[5] = tf.constant(self.lstm_bias / 2.0, tf.float32, b_shapes[5])

    opaque_params_t = cell.canonical_to_params(weights, biases)
    parameters = tf.get_variable("opaque_kernel", initializer=opaque_params_t,
                                 validate_shape=False)

    if is_train and self.dropout > 0:
        keep_prob = 1.0 - self.dropout
        # The loop variables are deliberately not called `x`: under
        # Python 2 scoping a comprehension variable leaks and would
        # overwrite the input tensor that is fed to the cell below.
        mult_bias = [tf.ones_like(b) for b in biases]
        mult_w = [tf.ones_like(w) for w in weights]
        # floor(U[keep_prob, 1 + keep_prob)) is 1 with probability
        # keep_prob and 0 otherwise; dividing by keep_prob rescales the
        # surviving entries.
        bias_mask = tf.floor(tf.random_uniform(
            (self.n_out, ), keep_prob, 1 + keep_prob)) / keep_prob
        # Only canonical weights 4..7 are masked (presumably the
        # recurrent matrices -- TODO confirm); biases are left untouched.
        for j in range(4, 8):
            mult_w[j] *= tf.expand_dims(bias_mask, 0)
        mult_mask = cell.canonical_to_params(mult_w, mult_bias)
        parameters = parameters * mult_mask

    if initial_states is None:
        # The signature advertises None as the default; unpacking it
        # directly would raise TypeError, so fall back to zero states.
        state_shape = (1, tf.shape(x)[1], self.n_out)
        initial_state_h = tf.zeros(state_shape, tf.float32)
        initial_state_c = tf.zeros(state_shape, tf.float32)
    else:
        initial_state_h, initial_state_c = initial_states

    out = cell(x, initial_state_h, initial_state_c, parameters, True)[0]
    return out
def _build_rnn_graph_cudnn(self, inputs, config, is_training):
    """Build the inference graph using CUDNN cell.

    Args:
        inputs: rank-3 tensor; its first two axes are swapped before it
            is fed to the cell and swapped back on the outputs.
        config: object providing ``num_layers``, ``hidden_size``,
            ``keep_prob`` and ``init_scale``.
        is_training: enables cuDNN dropout (``1 - keep_prob``) and is
            forwarded to the cell call.

    Returns:
        A pair ``(outputs, state)`` where ``outputs`` is reshaped to
        ``[-1, hidden_size]`` and ``state`` is a 1-tuple holding an
        ``LSTMStateTuple`` of the final h and c tensors.
    """
    transposed_inputs = tf.transpose(inputs, [1, 0, 2])

    from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

    hidden = config.hidden_size
    # Dropout is only active while training.
    if is_training:
        dropout_rate = 1 - config.keep_prob
    else:
        dropout_rate = 0
    self._cell = cudnn_rnn_ops.CudnnLSTM(
        num_layers=config.num_layers,
        num_units=hidden,
        input_size=hidden,
        dropout=dropout_rate)

    # All cuDNN weights live in a single flat variable whose length is
    # only known as a tensor, hence validate_shape=False.
    params_size_t = self._cell.params_size()
    initial_params = tf.random_uniform(
        [params_size_t], -config.init_scale, config.init_scale)
    self._rnn_params = tf.get_variable(
        "lstm_params", initializer=initial_params, validate_shape=False)

    state_shape = [config.num_layers, self.batch_size, hidden]
    c = tf.zeros(state_shape, tf.float32)
    h = tf.zeros(state_shape, tf.float32)
    self._initial_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c), )

    outputs, h, c = self._cell(
        transposed_inputs, h, c, self._rnn_params, is_training)
    outputs = tf.transpose(outputs, [1, 0, 2])
    outputs = tf.reshape(outputs, [-1, hidden])

    final_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c), )
    return outputs, final_state
def _apply_transposed(self, is_train, x):
    """Apply a cuDNN GRU or LSTM (chosen by ``self._kind``) to ``x``.

    Args:
        is_train: boolean tensor used as the predicate of ``tf.cond`` when
            recurrent drop-connect is enabled (``self.keep_recurrent < 1``).
        x: input tensor whose last dimension must be statically known.
            Dimension 1 of ``x`` is used as the batch size of the
            initial states.

    Returns:
        Whatever the cell call returns, unmodified.

    Raises:
        ValueError: if the last dimension of ``x`` is unknown, or if
            ``self._kind`` is neither ``"GRU"`` nor ``"LSTM"``.
        NotImplementedError: for an LSTM with ``learn_initial_states`` set.
    """
    w_init = get_keras_initialization(self.w_init)
    if self.recurrent_init is None:
        r_init = None
    else:
        r_init = get_keras_initialization(self.recurrent_init)

    x_size = x.shape.as_list()[-1]
    if x_size is None:
        raise ValueError("Last dimension must be defined (have shape %s)" % str(x.shape))

    if self._kind == "GRU":
        cell = cudnn_rnn_ops.CudnnGRU(self.n_layers, self.n_units, x_size,
                                      input_mode="linear_input")
    elif self._kind == "LSTM":
        cell = cudnn_rnn_ops.CudnnLSTM(self.n_layers, self.n_units, x_size,
                                       input_mode="linear_input")
    else:
        raise ValueError()

    # `.eval()` is called here, so a default session is presumably
    # required while the graph is being built.
    n_params = cell.params_size().eval()
    weights, biases = cell.params_to_canonical(tf.zeros([n_params]))

    def init(shape, dtype=None, partition_info=None):
        # This is a bit hacky, since the API for these models is awkward.
        # The shapes of the weights / biases are obtained by calling
        # `cell.params_to_canonical` on an unused tensor and then using
        # `.eval()`; after that the user-requested initializers can be
        # applied per canonical tensor.
        if self._kind == "LSTM":
            is_recurrent = [False] * 4 + [True] * 4
            is_forget_bias = [False, True, False, False,
                              False, True, False, False]
        else:
            is_recurrent = [False] * 3 + [True] * 3
            is_forget_bias = [False] * 6

        init_biases = []
        for flagged in is_forget_bias:
            if flagged:
                init_biases.append(
                    tf.constant(self.lstm_bias/2.0, tf.float32, (self.n_units,)))
            else:
                init_biases.append(tf.zeros(self.n_units))

        init_weights = [
            tf.reshape(r_init((self.n_units, self.n_units), w.dtype), tf.shape(w))
            if recurrent and r_init is not None
            else w_init(tf.shape(w).eval(), w.dtype)
            for w, recurrent in zip(weights, is_recurrent)
        ]

        out = cell.canonical_to_params(init_weights, init_biases)
        out.set_shape((n_params, ))
        return out

    parameters = tf.get_variable(
        "gru_parameters", n_params, tf.float32, initializer=init
    )

    if self.keep_recurrent < 1:
        # Not well tested: try to figure out which indices in `parameters`
        # are recurrent weights and drop them; this implements
        # drop-connect for the recurrent weights.
        half = len(weights) // 2
        is_recurrent = weights[:half] + [tf.ones_like(w) for w in weights[half:]]
        # Ones at the recurrent-weight positions.
        recurrent_mask = cell.canonical_to_params(is_recurrent, biases)
        # One for non-recurrent parameters, keep_recurrent elsewhere.
        recurrent_mask = 1 - recurrent_mask * (1 - self.keep_recurrent)

        base_parameters = parameters

        def _with_drop_connect():
            noise = tf.random_uniform((n_params, ))
            return tf.floor(noise + recurrent_mask) * base_parameters

        def _unchanged():
            return base_parameters

        parameters = tf.cond(is_train, _with_drop_connect, _unchanged)

    def _zero_state():
        return tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)

    if self._kind == "LSTM":
        if self.learn_initial_states:
            raise NotImplementedError()
        initial_state_h = _zero_state()
        initial_state_c = _zero_state()
        return cell(x, initial_state_h, initial_state_c, parameters, True)

    if self.learn_initial_states:
        # One learned vector, broadcast over layers and batch entries.
        initial_state = tf.get_variable("initial_state", self.n_units, tf.float32,
                                        tf.zeros_initializer())
        expanded = tf.expand_dims(tf.expand_dims(initial_state, 0), 0)
        initial_state = tf.tile(expanded, [self.n_layers, tf.shape(x)[1], 1])
    else:
        initial_state = _zero_state()
    return cell(x, initial_state, parameters, True)
def __init__(self, is_training, config, input_):
    """Build a CudnnLSTM language-model graph, plus the training ops.

    Args:
        is_training: when false, only the forward graph, cost and final
            state are built; the learning-rate and optimizer ops are
            skipped.
        config: object providing ``hidden_size``, ``vocab_size``,
            ``num_layers``, ``init_scale`` and ``max_grad_norm``.
        input_: object providing ``batch_size``, ``num_steps``,
            ``input_data`` and ``targets``.
    """
    self._input = input_

    batch_size = input_.batch_size
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size
    num_layers = config.num_layers

    # Dropout is pinned to 0.0 and `config.keep_prob` is not consulted.
    # A rate derived from keep_prob used to be computed under an inverted
    # `is_training == False` test and was then unconditionally overwritten
    # with 0.0; that dead branch has been removed, the effective value is
    # unchanged.
    # TODO(review): if dropout is wanted, use `1 - config.keep_prob` only
    # when `is_training` is true.
    dropout_rate = 0.0

    with tf.device("/cpu:0"):
        embedding = tf.get_variable("embedding", [vocab_size, size],
                                    dtype=data_type())
        inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

    model = cudnn_rnn_ops.CudnnLSTM(num_layers, size, size,
                                    dropout=dropout_rate)
    params_size_t = model.params_size()

    # The same zero tensor is used for both the initial h and c states.
    init_state = tf.zeros([num_layers, batch_size, size],
                          dtype=data_type(), name="Kit_Init_State")
    # The parameter count is a tensor, hence validate_shape=False.
    params = tf.get_variable(
        'Kit_Parameters',
        initializer=tf.random_uniform([params_size_t], -config.init_scale,
                                      config.init_scale),
        validate_shape=False)

    # Swap the first two axes before the cell and swap them back after.
    inputs = tf.transpose(inputs, perm=[1, 0, 2])
    output, output_h, output_c = model(is_training=is_training,
                                       input_data=inputs,
                                       input_h=init_state,
                                       input_c=init_state,
                                       params=params)
    output = tf.transpose(output, [1, 0, 2])
    output = tf.reshape(output, [-1, size])

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                dtype=data_type())
    softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
    logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    # Reshape logits to be 3-D tensor for sequence loss
    logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])

    # use the contrib sequence loss and average over the batches
    loss = tf.contrib.seq2seq.sequence_loss(
        logits,
        input_.targets,
        tf.ones([batch_size, num_steps], dtype=data_type()),
        average_across_timesteps=False,
        average_across_batch=True)

    # update the cost variables
    self._cost = cost = tf.reduce_sum(loss)
    # NOTE(review): this is a single LSTMStateTuple, not a 1-tuple
    # containing one (there is no trailing comma) -- confirm that callers
    # expect this.
    self._final_state = tf.contrib.rnn.LSTMStateTuple(h=output_h, c=output_c)

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # The learning rate is changed by feeding `_new_lr` and running
    # `_lr_update`.
    self._new_lr = tf.placeholder(
        tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
def cudnn_rnn_wrapper(input_data, rnn_mode, num_layers, num_units, input_size,
                      variable_name, direction="unidirectional",
                      time_major=False, dropout=0.0):
    """Create a cuDNN LSTM/GRU with zero initial state and apply it.

    Args:
        input_data: rank-3 input. With ``time_major=False`` axis 0 is
            taken as the batch axis and the first two axes are swapped
            before the RNN is applied; with ``time_major=True`` axis 1 is
            taken as the batch axis and the input is used as-is.
        rnn_mode: ``"lstm"`` or ``"gru"``.
        num_layers: forwarded to the model constructor.
        num_units: forwarded to the model constructor.
        input_size: forwarded to the model constructor.
        variable_name: name of the variable holding the flat RNN params.
        direction: ``"unidirectional"`` or ``"bidirectional"``.
        time_major: see ``input_data``.
        dropout: forwarded to the model constructor.

    Returns:
        ``(output, output_h)`` as returned by the model. The output is
        not transposed back, regardless of ``time_major``. For an LSTM
        the final cell state is discarded.

    Raises:
        ValueError: on an unknown ``rnn_mode`` or ``direction``.
    """
    is_lstm = rnn_mode == "lstm"
    if not is_lstm and not rnn_mode == "gru":
        raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
    rnn_cls = cudnn_rnn_ops.CudnnLSTM if is_lstm else cudnn_rnn_ops.CudnnGRU
    model = rnn_cls(num_layers, num_units, input_size,
                    direction=direction, dropout=dropout)

    # Total size of the RNN params (a tensor), so the variable's static
    # shape cannot be validated.
    params_size_ts = model.params_size()
    initial_params = tf.random_uniform([params_size_ts], minval=-0.1, maxval=0.1)
    params = tf.Variable(initial_params, validate_shape=False,
                         name=variable_name)

    if time_major:
        batch_size_ts = tf.shape(input_data)[1]
    else:
        batch_size_ts = tf.shape(input_data)[0]
        input_data = tf.transpose(input_data, [1, 0, 2])

    if direction == "bidirectional":
        dir_count = 2
    elif direction == "unidirectional":
        dir_count = 1
    else:
        raise ValueError("Invalid direction: %s" % direction)

    def _zero_state():
        return tf.zeros(
            tf.stack([num_layers * dir_count, batch_size_ts, num_units]))

    init_h = _zero_state()

    # Only the LSTM takes (and returns) a cell state.
    if is_lstm:
        init_c = _zero_state()
        output, output_h, _ = model(input_data=input_data, input_h=init_h,
                                    input_c=init_c, params=params)
    else:
        output, output_h = model(input_data=input_data, input_h=init_h,
                                 params=params)

    return output, output_h
def __init__(self, is_training, batch_size, num_unrollings, vocab_size,
             hidden_size, max_grad_norm, embedding_size, num_layers,
             learning_rate, model, dropout=0.0, input_dropout=0.0,
             use_batch=True):
    """Build the language-model graph on top of a cuDNN-style RNN op.

    Args:
        is_training: Python bool; enables input dropout, is forwarded to
            the RNN op, and controls whether the train op is built.
        batch_size: batch size (forced to 1 when ``use_batch`` is false).
        num_unrollings: number of unrolled steps (forced to 1 when
            ``use_batch`` is false).
        vocab_size: vocabulary size.
        hidden_size: number of RNN units.
        max_grad_norm: global-norm clipping threshold for the gradients.
        embedding_size: embedding width; when ``<= 0`` a constant one-hot
            (identity) embedding of width ``vocab_size`` is used and
            input dropout is disabled.
        num_layers: number of RNN layers.
        learning_rate: constant learning rate for the Adam optimizer.
        model: ``'lstm'``, ``'gru'`` or ``'rnn'``.
        dropout: dropout rate forwarded to the RNN constructor.
        input_dropout: dropout rate applied to the embedded inputs while
            training.
        use_batch: see ``batch_size`` / ``num_unrollings``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    self.batch_size = batch_size
    self.num_unrollings = num_unrollings
    if not use_batch:
        self.batch_size = 1
        self.num_unrollings = 1
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.max_grad_norm = max_grad_norm
    self.num_layers = num_layers
    self.embedding_size = embedding_size
    self.model = model
    self.dropout = dropout
    self.input_dropout = input_dropout
    if embedding_size <= 0:
        self.input_size = vocab_size
        # Don't do dropout on one hot representation.
        self.input_dropout = 0.0
    else:
        self.input_size = embedding_size
    self.model_size = (
        embedding_size * vocab_size +  # embedding parameters
        # lstm parameters
        4 * hidden_size * (hidden_size + self.input_size + 1) +
        # softmax parameters
        vocab_size * (hidden_size + 1) +
        # multilayer lstm parameters for extra layers.
        (num_layers - 1) * 4 * hidden_size *
        (hidden_size + hidden_size + 1))
    # self.decay_rate = decay_rate

    # Placeholder to feed in input and targets/labels data.
    self.input_data = tf.placeholder(
        tf.int64, [self.batch_size, self.num_unrollings], name='inputs')
    self.targets = tf.placeholder(
        tf.int64, [self.batch_size, self.num_unrollings], name='targets')

    #################################################
    #NEED TO REPLACE ALL CELL CODE
    # if self.model == 'rnn':
    #     cell_fn = tf.contrib.rnn.BasicRNNCell
    # elif self.model == 'lstm':
    #     cell_fn = tf.contrib.rnn.BasicLSTMCell
    # elif self.model == 'gru':
    #     cell_fn = tf.contrib.rnn.GRUCell
    #
    # # params = {'input_size': self.input_size}
    # params = {}
    # if self.model == 'lstm':
    #     # add bias to forget gate in lstm.
    #     params['forget_bias'] = 0.0
    #     params['state_is_tuple'] = True
    # # Create multilayer cell.
    # cell = cell_fn(
    #     self.hidden_size, reuse=tf.get_variable_scope().reuse,
    #     **params)
    # cells = [cell]
    # # params['input_size'] = self.hidden_size
    # # more explicit way to create cells for MultiRNNCell than
    # # [higher_layer_cell] * (self.num_layers - 1)
    # for i in range(self.num_layers-1):
    #     higher_layer_cell = cell_fn(
    #         self.hidden_size, reuse=tf.get_variable_scope().reuse,
    #         **params)
    #     cells.append(higher_layer_cell)
    # if is_training and self.dropout > 0:
    #     cells = [tf.contrib.rnn.DropoutWrapper(
    #         cell,
    #         output_keep_prob=1.0-self.dropout)
    #         for cell in cells]
    # multi_cell = tf.contrib.rnn.MultiRNNCell(cells)
    # with tf.name_scope('initial_state'):
    #     # zero_state is used to compute the initial state for cell.
    #     self.zero_state = multi_cell.zero_state(self.batch_size, tf.float32)
    #     # Placeholder to feed in initial state.
    #     # self.initial_state = tf.placeholder(
    #     #     tf.float32,
    #     #     [self.batch_size, multi_cell.state_size],
    #     #     'initial_state')
    #     self.initial_state = create_tuple_placeholders_with_default(
    #         multi_cell.zero_state(batch_size, tf.float32),
    #         extra_dims=(None,),
    #         shape=multi_cell.state_size)
    ######## MIGHT NEED THIS STUFF ##################

    # Embeddings layers.
    with tf.name_scope('embedding_layer'):
        if embedding_size > 0:
            self.embedding = tf.get_variable(
                'embedding', [self.vocab_size, self.embedding_size])
        else:
            self.embedding = tf.constant(np.eye(self.vocab_size),
                                         dtype=tf.float32)

        inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
        if is_training and self.input_dropout > 0:
            inputs = tf.nn.dropout(inputs, 1 - self.input_dropout)

    with tf.name_scope('slice_inputs'):
        # Slice inputs into a list of shape [batch_size, 1] data columns.
        # NOTE(review): only the commented-out static_rnn call below
        # consumes this list; it is currently unused.
        sliced_inputs = [
            tf.squeeze(input_, [1])
            for input_ in tf.split(axis=1,
                                   num_or_size_splits=self.num_unrollings,
                                   value=inputs)
        ]

    # Copy cell to do unrolling and collect outputs.
    # outputs, final_state = tf.contrib.rnn.static_rnn(
    #     multi_cell, sliced_inputs,
    #     initial_state=self.initial_state)

    ########################
    #Insert MIOPEN
    # The RNN is sized with `self.input_size` (not `self.embedding_size`)
    # so that the one-hot path, where embedding_size <= 0 and the inputs
    # have width vocab_size, gets a valid input size. Both values are
    # identical whenever embedding_size > 0.
    if self.model == 'lstm':
        rnn_model = cudnn_rnn_ops.CudnnLSTM(self.num_layers,
                                            self.hidden_size,
                                            self.input_size,
                                            dropout=self.dropout)
    elif self.model == 'gru':
        rnn_model = cudnn_rnn_ops.CudnnGRU(self.num_layers,
                                           self.hidden_size,
                                           self.input_size,
                                           dropout=self.dropout)
    elif self.model == 'rnn':
        rnn_model = cudnn_rnn_ops.CudnnRNNTanh(self.num_layers,
                                               self.hidden_size,
                                               self.input_size,
                                               dropout=self.dropout)
    else:
        raise ValueError("Invalid model: %s" % self.model)

    # Set zero init input states
    # NOTE(review): the middle dimension is num_unrollings and `inputs`
    # is fed as [batch_size, num_unrollings, dim] without a transpose --
    # confirm which axis the op treats as time and which as batch.
    input_h = constant_op.constant(
        np.zeros([self.num_layers, self.num_unrollings, self.hidden_size]),
        dtype=tf.float32)
    has_input_c = (self.model == 'lstm')
    if has_input_c:
        input_c = constant_op.constant(
            np.zeros([self.num_layers, self.num_unrollings,
                      self.hidden_size]),
            dtype=tf.float32)

    # Set rnn params
    params_size_t = rnn_model.params_size()
    # NOTE(review): `params_size_t.shape` is the shape of the size tensor
    # itself, not the parameter count; sibling code elsewhere builds the
    # params as `random_uniform([params_size_t])` with
    # `validate_shape=False`. Left unchanged because `zero_state` /
    # `initial_state` below depend on the same expression -- needs a
    # decision from the author.
    rand_params = random_ops.random_uniform(params_size_t.shape)

    # Debug output. Single-argument print(...) calls behave the same
    # under Python 2 and Python 3.
    print("PARAMS size")
    print(params_size_t)
    print(rand_params.shape)
    print("Input sizes")
    print(input_h)
    if has_input_c:
        # input_c only exists for the LSTM; printing it unconditionally
        # raised NameError for 'gru' / 'rnn'.
        print(input_c)
    print("Batch size")
    print(batch_size)
    print("Hidden size")
    print(self.hidden_size)
    #rand_params.set_shape(params_size_t.shape);
    params = variables.Variable(rand_params, validate_shape=True)

    args = {
        "input_data": inputs,
        "input_h": input_h,
        "params": params,
        "is_training": is_training
    }
    if has_input_c:
        args["input_c"] = input_c

    # Build cell
    # The LSTM returns (output, h, c) while GRU / RNN return (output, h),
    # so only the first two entries are taken. Passing `args` also
    # forwards `is_training`, so evaluation graphs no longer run the op
    # in its default training mode.
    rnn_results = rnn_model(**args)
    outputs, final_state = rnn_results[0], rnn_results[1]

    # NOTE(review): `zero_state` is an op that assigns zeros to the RNN
    # *parameters*, not an RNN state -- confirm this is intended.
    self.zero_state = state_ops.assign(
        params, array_ops.zeros(params_size_t.shape))
    self.initial_state = create_tuple_placeholders_with_default(
        self.zero_state,
        extra_dims=(None, ),
        shape=params_size_t.shape)
    print("Initial State")
    print(self.initial_state)
    ########################

    self.final_state = final_state

    with tf.name_scope('flatten_ouputs'):
        # Flatten the outputs into one dimension.
        flat_outputs = tf.reshape(tf.concat(axis=1, values=outputs),
                                  [-1, hidden_size])

    with tf.name_scope('flatten_targets'):
        # Flatten the targets too.
        flat_targets = tf.reshape(tf.concat(axis=1, values=self.targets),
                                  [-1])

    # Create softmax parameters, weights and bias.
    with tf.variable_scope('softmax') as sm_vs:
        softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        self.logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

    with tf.name_scope('loss'):
        # Compute mean cross entropy loss for each output.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits, labels=flat_targets)
        self.mean_loss = tf.reduce_mean(loss)

    with tf.name_scope('loss_monitor'):
        # Count the number of elements and the sum of mean_loss
        # from each batch to compute the average loss.
        count = tf.Variable(1.0, name='count')
        sum_mean_loss = tf.Variable(1.0, name='sum_mean_loss')

        self.reset_loss_monitor = tf.group(sum_mean_loss.assign(0.0),
                                           count.assign(0.0),
                                           name='reset_loss_monitor')
        self.update_loss_monitor = tf.group(
            sum_mean_loss.assign(sum_mean_loss + self.mean_loss),
            count.assign(count + 1),
            name='update_loss_monitor')
        # Reading the average forces the running sums to be updated first.
        with tf.control_dependencies([self.update_loss_monitor]):
            self.average_loss = sum_mean_loss / count
            self.ppl = tf.exp(self.average_loss)

        # Monitor the loss.
        loss_summary_name = "average loss"
        ppl_summary_name = "perplexity"

        average_loss_summary = tf.summary.scalar(loss_summary_name,
                                                 self.average_loss)
        ppl_summary = tf.summary.scalar(ppl_summary_name, self.ppl)

    # Monitor the loss.
    self.summaries = tf.summary.merge([average_loss_summary, ppl_summary],
                                      name='loss_monitor')

    self.global_step = tf.get_variable(
        'global_step', [], initializer=tf.constant_initializer(0.0))

    self.learning_rate = tf.constant(learning_rate)
    if is_training:
        # learning_rate = tf.train.exponential_decay(1.0, self.global_step,
        #                                            5000, 0.1, staircase=True)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.mean_loss, tvars), self.max_grad_norm)
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)

        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step)