def _get_outputs(self, inputs, seq_lengths):
    """compute the evaluation logits for a batch of data

    Args:
        inputs: the inputs to the neural network, this is a dictionary of
            [batch_size x ...] tensors
        seq_lengths: the sequence lengths of the input utterances, this is
            a dictionary of [batch_size] vectors

    Returns:
        the logits"""

    with tf.name_scope('evaluate_logits'):
        logits, _ = run_multi_model.run_multi_model(
            models=self.models,
            model_nodes=self.model_nodes,
            model_links=self.model_links,
            inputs=inputs,
            inputs_links=self.inputs_links,
            nodes_output_names=self.nodes_output_names,
            output_names=self.output_names,
            seq_lengths=seq_lengths,
            is_training=False)

    return logits
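# A minimal, illustrative usage sketch for _get_outputs; the 'features' input
# name, the tensors and the 'task_trainer' instance are assumptions, not part
# of this module:
#
#   inputs = {'features': feature_batch}      # [batch_size x time x dim]
#   seq_lengths = {'features': length_batch}  # [batch_size]
#   logits = task_trainer._get_outputs(inputs, seq_lengths)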
def gather_grads(self, optimizer):
    """Gather the gradients for this task"""

    with tf.variable_scope(self.task_name):

        # a variable to hold the batch loss
        self.batch_loss = tf.get_variable(
            name='batch_loss',
            shape=[],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0),
            trainable=False)

        # a variable to hold the batch loss norm
        self.batch_loss_norm = tf.get_variable(
            name='batch_loss_norm',
            shape=[],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0),
            trainable=False)

        # normalize the loss
        with tf.variable_scope('normalize_loss'):
            self.normalized_loss = self.batch_loss / self.batch_loss_norm

        self.process_minibatch = []
        for set_ind, linkedset in enumerate(self.linkedsets):

            inputs = dict()
            seq_lengths = dict()
            targets = dict()

            # create the input pipeline
            data, seq_length = input_pipeline.input_pipeline(
                data_queue=self.data_queue[linkedset],
                batch_size=self.batch_size,
                numbuckets=int(self.trainerconf['numbuckets']),
                dataconfs=self.input_dataconfs[linkedset] +
                self.target_dataconfs[linkedset])

            # split the data into inputs and targets
            for ind, input_name in enumerate(self.input_names):
                inputs[input_name] = data[ind]
                seq_lengths[input_name] = seq_length[ind]

            for ind, target_name in enumerate(self.target_names):
                targets[target_name] = data[len(self.input_names) + ind]

            # get the logits
            logits, used_models = run_multi_model.run_multi_model(
                models=self.models,
                model_nodes=self.model_nodes,
                model_links=self.model_links,
                inputs=inputs,
                inputs_links=self.inputs_links,
                nodes_output_names=self.nodes_output_names,
                output_names=self.output_names,
                seq_lengths=seq_lengths,
                is_training=True)

            # compute the loss, weighted per linked set
            task_minibatch_loss, task_minibatch_loss_norm = \
                self.loss_computer(targets, logits, seq_lengths)
            task_minibatch_loss *= self.linkedset_weighting[linkedset]
            task_minibatch_loss_norm *= self.linkedset_weighting[linkedset]

            # only compute gradients for the variables of the models that
            # were actually used
            used_variables = run_multi_model.get_variables(used_models)
            task_minibatch_grads_and_vars = optimizer.compute_gradients(
                task_minibatch_loss, var_list=used_variables)

            (task_minibatch_grads, task_vars) = zip(
                *task_minibatch_grads_and_vars)

            if set_ind == 0:
                # This should ideally be done before the loop, but the
                # trainable parameters are only known at this point:
                # self.params = tf.trainable_variables()
                self.params = task_vars

                # a variable to hold all the gradients
                self.grads = [
                    tf.get_variable(
                        param.op.name,
                        param.get_shape().as_list(),
                        initializer=tf.constant_initializer(0),
                        trainable=False)
                    for param in self.params]

            # update the batch gradients with the minibatch gradients.
            # If a minibatch gradient is None, the loss does not depend on
            # the specific variable(s) and it will thus not be updated
            with tf.variable_scope('update_gradients_%s' % linkedset):
                update_gradients = [
                    grad.assign_add(batchgrad)
                    for batchgrad, grad in zip(
                        task_minibatch_grads, self.grads)
                    if batchgrad is not None]

            acc_loss = self.batch_loss.assign_add(task_minibatch_loss)
            acc_loss_norm = self.batch_loss_norm.assign_add(
                task_minibatch_loss_norm)

            # group all the operations together that need to be executed
            # to process a minibatch
            self.process_minibatch.append(tf.group(
                *(update_gradients + [acc_loss] + [acc_loss_norm]),
                name='update_grads_loss_norm_%s' % linkedset))

        reset_batch_loss = self.batch_loss.assign(0.0)
        reset_batch_loss_norm = self.batch_loss_norm.assign(0.0)
        reset_grad = tf.variables_initializer(self.grads)

        # normalize the gradients if requested
        with tf.variable_scope('normalize_gradients'):
            if self.trainerconf['normalize_gradients'] == 'True':
                self.normalize_gradients = [
                    grad.assign(tf.divide(grad, self.batch_loss_norm))
                    for grad in self.grads]
            else:
                self.normalize_gradients = [
                    grad.assign(grad) for grad in self.grads]

        # an op to reset the grads, the loss and the loss norm
        self.reset_grad_loss_norm = tf.group(
            *([reset_grad, reset_batch_loss, reset_batch_loss_norm]),
            name='reset_grad_loss_norm')

        batch_grads_and_vars = zip(self.grads, task_vars)

        with tf.variable_scope('clip'):
            clip_value = float(self.trainerconf['clip_grad_value'])

            # clip the gradients
            batch_grads_and_vars = [
                (tf.clip_by_value(grad, -clip_value, clip_value), var)
                for grad, var in batch_grads_and_vars]

    return batch_grads_and_vars
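# Illustrative sketch of how the ops created by gather_grads might be driven
# from a training loop; the 'task_trainer' instance, the session 'sess' and
# the outer loop over minibatches are assumptions, not defined in this module:
#
#   grads_and_vars = task_trainer.gather_grads(optimizer)
#   apply_op = optimizer.apply_gradients(grads_and_vars)
#
#   sess.run(task_trainer.reset_grad_loss_norm)
#   for minibatch_op in task_trainer.process_minibatch:
#       sess.run(minibatch_op)                  # accumulate grads per linked set
#   sess.run(task_trainer.normalize_gradients)  # divide by the loss norm if set
#   sess.run(apply_op)                          # apply the accumulated gradients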
def train(self, learning_rate):
    '''set the training ops for this task'''

    with tf.variable_scope(self.task_name):

        # create the optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        inputs = dict()
        seq_lengths = dict()
        targets = dict()

        for linkedset in self.linkedsets:

            # create the input pipeline
            data, seq_length = input_pipeline.input_pipeline(
                data_queue=self.data_queue[linkedset],
                batch_size=self.batch_size,
                numbuckets=int(self.trainerconf['numbuckets']),
                dataconfs=self.input_dataconfs[linkedset] +
                self.target_dataconfs[linkedset])

            # split the data into inputs and targets
            for ind, input_name in enumerate(
                    self.linkedsets[linkedset]['inputs']):
                inputs[input_name] = data[ind]
                seq_lengths[input_name] = seq_length[ind]

            for ind, target_name in enumerate(
                    self.linkedsets[linkedset]['targets']):
                targets[target_name] = data[
                    len(self.linkedsets[linkedset]['inputs']) + ind]

        # get the logits
        logits = run_multi_model.run_multi_model(
            models=self.models,
            model_nodes=self.model_nodes,
            model_links=self.model_links,
            inputs=inputs,
            inputs_links=self.inputs_links,
            output_names=self.output_names,
            seq_lengths=seq_lengths,
            is_training=True)

        # TODO: The proper way to exploit data parallelism is via the
        # SyncReplicasOptimizer defined below. However, for some reason it
        # hangs and I have not yet found a solution for it. For the moment
        # the gradients are accumulated in a way that does not allow data
        # parallelism and there is no advantage in having multiple workers.

        # create an optimizer that aggregates gradients
        # if int(conf['numbatches_to_aggregate']) > 0:
        #     optimizer = tf.train.SyncReplicasOptimizer(
        #         opt=optimizer,
        #         replicas_to_aggregate=int(
        #             conf['numbatches_to_aggregate'])#,
        #         #total_num_replicas=num_replicas
        #     )

        # a variable to hold the batch loss
        self.batch_loss = tf.get_variable(
            name='batch_loss',
            shape=[],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0),
            trainable=False)
        reset_batch_loss = self.batch_loss.assign(0.0)

        # a variable to hold the batch loss norm
        self.batch_loss_norm = tf.get_variable(
            name='batch_loss_norm',
            shape=[],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0),
            trainable=False)
        reset_batch_loss_norm = self.batch_loss_norm.assign(0.0)

        # gather all trainable parameters
        self.params = tf.trainable_variables()

        # a variable to hold all the gradients
        self.grads = [
            tf.get_variable(
                param.op.name,
                param.get_shape().as_list(),
                initializer=tf.constant_initializer(0),
                trainable=False)
            for param in self.params]
        reset_grad = tf.variables_initializer(self.grads)

        # compute the loss
        task_minibatch_loss, task_minibatch_loss_norm = self.loss_computer(
            targets, logits, seq_lengths)

        task_minibatch_grads_and_vars = optimizer.compute_gradients(
            task_minibatch_loss)

        (task_minibatch_grads, task_vars) = zip(
            *task_minibatch_grads_and_vars)

        # update the batch gradients with the minibatch gradients.
        # If a minibatch gradient is None, the loss does not depend on the
        # specific variable(s) and it will thus not be updated
        with tf.variable_scope('update_gradients'):
            update_gradients = [
                grad.assign_add(batchgrad)
                for batchgrad, grad in zip(
                    task_minibatch_grads, self.grads)
                if batchgrad is not None]

        acc_loss = self.batch_loss.assign_add(task_minibatch_loss)
        acc_loss_norm = self.batch_loss_norm.assign_add(
            task_minibatch_loss_norm)

        # group all the operations together that need to be executed to
        # process a minibatch
        self.process_minibatch = tf.group(
            *(update_gradients + [acc_loss] + [acc_loss_norm]),
            name='update_grads_loss_norm')

        # an op to reset the grads, the loss and the loss norm
        self.reset_grad_loss_norm = tf.group(
            *([reset_grad, reset_batch_loss, reset_batch_loss_norm]),
            name='reset_grad_loss_norm')

        # normalize the loss
        with tf.variable_scope('normalize_loss'):
            self.normalized_loss = self.batch_loss / self.batch_loss_norm

        # normalize the gradients if requested
        with tf.variable_scope('normalize_gradients'):
            if self.trainerconf['normalize_gradients'] == 'True':
                self.normalize_gradients = [
                    grad.assign(tf.divide(grad, self.batch_loss_norm))
                    for grad in self.grads]
            else:
                self.normalize_gradients = [
                    grad.assign(grad) for grad in self.grads]

        batch_grads_and_vars = zip(self.grads, task_vars)

        with tf.variable_scope('clip'):
            clip_value = float(self.trainerconf['clip_grad_value'])

            # clip the gradients
            batch_grads_and_vars = [
                (tf.clip_by_value(grad, -clip_value, clip_value), var)
                for grad, var in batch_grads_and_vars]

        # an op to apply the accumulated gradients to the variables
        self.apply_gradients = optimizer.apply_gradients(
            grads_and_vars=batch_grads_and_vars,
            name='apply_gradients')
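# Illustrative sketch of the accumulate-normalize-apply cycle set up by
# train(); the 'task_trainer' instance, the session 'sess' and
# 'num_minibatches' are assumptions, not defined in this module:
#
#   task_trainer.train(learning_rate)
#   sess.run(task_trainer.reset_grad_loss_norm)
#   for _ in range(num_minibatches):
#       sess.run(task_trainer.process_minibatch)   # accumulate grads and loss
#   sess.run(task_trainer.normalize_gradients)     # optional normalization
#   sess.run(task_trainer.apply_gradients)         # apply accumulated grads
#   loss = sess.run(task_trainer.normalized_loss)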