def create_tf_operations(self, config):
    """Assemble the policy network, action distributions and optional baselines.

    After delegating to the parent class, a ``NeuralNetwork`` is built from
    ``config.network`` on ``self.state`` inside the ``value_function`` scope
    and its internal inputs/outputs/inits are appended to the model's lists.
    Every entry of ``self.distribution`` is then connected to the network
    output and sampled into ``self.action_taken``.  If ``self.baseline`` is
    truthy, one baseline is created per entry of ``config.states``.

    Args:
        config: Model configuration; ``network`` and ``states`` are read here.
    """
    super(PolicyGradientModel, self).create_tf_operations(config)

    with tf.variable_scope('value_function'):
        self.network = NeuralNetwork(
            network_builder=util.get_function(fct=config.network),
            inputs=self.state
        )
        policy_net = self.network
        # Expose the network's internal (e.g. recurrent) tensors through the model.
        self.internal_inputs.extend(policy_net.internal_inputs)
        self.internal_outputs.extend(policy_net.internal_outputs)
        self.internal_inits.extend(policy_net.internal_inits)

    with tf.variable_scope('distribution'):
        for action_name, action_distribution in self.distribution.items():
            with tf.variable_scope(action_name):
                action_distribution.create_tf_operations(
                    x=self.network.output,
                    deterministic=self.deterministic
                )
            self.action_taken[action_name] = action_distribution.sample()

    if not self.baseline:
        return

    with tf.variable_scope('baseline'):
        # One baseline per state input; their predictions are averaged later.
        for state_name, state_spec in config.states:
            baseline_scope = 'baseline_' + state_name
            self.baseline[state_name].create_tf_operations(state_spec, scope=baseline_scope)
def create_tf_operations(self, config):
    """Create placeholders, training/target networks, Q-loss and target update.

    Steps, in graph-construction order:

    1. ``next_<state>`` placeholders, one per entry of ``config.states``.
    2. Training network on ``self.state``; ``self.q_values`` comes from
       ``self.create_training_operations``.
    3. Target network on ``self.next_state``; ``self.target_values`` comes
       from ``self.create_target_operations``.
    4. Loss from the deltas returned by ``self.create_q_deltas`` (squared, or
       Huber when ``config.clip_loss > 0``), registered with ``tf.losses``.
    5. Soft update ops moving each target variable towards its training
       counterpart by ``config.update_target_weight``.

    Args:
        config: Model configuration (``states``, ``network``,
            ``tf_summary_level``, ``clip_loss``, ``update_target_weight``).
    """
    super(QModel, self).create_tf_operations(config)

    # Placeholders for the successor states
    with tf.variable_scope('placeholder'):
        self.next_state = {
            state_name: tf.placeholder(
                dtype=util.tf_dtype(state_spec.type),
                shape=(None,) + tuple(state_spec.shape),
                name=('next_' + state_name)
            )
            for state_name, state_spec in config.states.items()
        }

    build_network = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(
            network_builder=build_network,
            inputs=self.state,
            summary_level=config.tf_summary_level
        )
        # Position at which this network's internals start in the shared lists.
        self.network_internal_index = len(self.internal_inputs)
        online_net = self.training_network
        self.internal_inputs.extend(online_net.internal_inputs)
        self.internal_outputs.extend(online_net.internal_outputs)
        self.internal_inits.extend(online_net.internal_inits)
        self.q_values = self.create_training_operations(config)
    self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=build_network, inputs=self.next_state)
        self.next_internal_inputs = list(self.target_network.internal_inputs)
        self.target_values = self.create_target_operations(config)
    self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        per_action_deltas = self.create_q_deltas(config)

        # Surrogate loss: mean squared error between observed and expected rewards
        stacked_deltas = tf.concat(values=per_action_deltas, axis=1)
        delta = tf.reduce_mean(input_tensor=stacked_deltas, axis=1)
        self.loss_per_instance = tf.square(delta)

        if config.clip_loss > 0.0:
            # Huber loss: quadratic below the threshold, linear above it
            below_threshold = tf.abs(delta) < config.clip_loss
            quadratic_part = 0.5 * self.loss_per_instance
            linear_part = config.clip_loss * tf.abs(delta) - 0.5 * config.clip_loss ** 2
            clipped_loss = tf.where(condition=below_threshold, x=quadratic_part, y=linear_part)
            self.q_loss = tf.reduce_mean(input_tensor=clipped_loss, axis=0)
        else:
            self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(self.q_loss)

        # One summary per action when the loss is not a single value
        if len(self.q_loss.shape) <= 1:
            tf.summary.scalar('q-loss', self.q_loss)
        else:
            for action_ind in range(self.q_loss.shape[1]):
                summary_tag = 'q-loss-action-{}'.format(action_ind)
                tf.summary.scalar(summary_tag, self.q_loss[action_ind])

    # Update target network
    with tf.name_scope('update-target'):
        self.target_network_update = [
            target_var.assign_sub(config.update_target_weight * (target_var - source_var))
            for source_var, target_var in zip(self.training_variables, self.target_variables)
        ]
def create_tf_operations(self, config):
    """Build the graph of the categorical (distributional) DQN model.

    Creates next-state placeholders, a training and a target network with
    per-action categorical outputs (via ``self._create_action_outputs``), the
    projected-target cross-entropy loss for every action head, and the soft
    target-network update ops.

    Changes compared to the previous revision of this method:

    * The interpolation weights of the categorical projection were swapped:
      the lower atom received ``b - l`` and the upper atom ``u - b``, which
      gives the atom *farther* from the projected sample more mass.  They are
      now ``u - b`` for the lower and ``b - l`` for the upper atom.
    * When the projected sample falls exactly on an atom (``l == u``) both
      weights were zero and the probability mass vanished; it is now
      assigned to that atom.
    * The fractional atom index is clamped to ``[0, num_atoms - 1]`` so that
      rounding in the division cannot yield out-of-range scatter indices.

    Args:
        config: Model configuration (``states``, ``actions``, ``network``,
            ``tf_summary_level``, ``update_target_weight``).
    """
    super(CategoricalDQNModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(
                dtype=util.tf_dtype(state.type),
                shape=(None,) + tuple(state.shape),
                name=name
            )

    # Constants delta_z and z: z is the discretised support from vmin to vmax
    scaling_increment = (self.distribution_max - self.distribution_min) / (self.num_atoms - 1)  # delta_z
    quantized_steps = self.distribution_min + np.arange(self.num_atoms) * scaling_increment  # z
    num_actions = {name: action.num_actions for name, action in config.actions}

    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(
            network_builder=network_builder,
            inputs=self.state,
            summary_level=config.tf_summary_level
        )
        self.network_internal_index = len(self.internal_inputs)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        _, training_output_probabilities, _, action_taken = self._create_action_outputs(
            self.training_network.output, quantized_steps, self.num_atoms, config, self.action, num_actions
        )

        for action in self.action:
            # Stack to preserve an action_taken shape like (batch_size, num_actions)
            if len(action_taken[action]) > 1:
                self.action_taken[action] = tf.stack(action_taken[action], axis=1)
            else:
                self.action_taken[action] = action_taken[action][0]

            # Summarise the expected-reward histogram of every head/action pair
            if config.tf_summary_level >= 1:
                for action_shaped in range(len(action_taken[action])):
                    for action_ind in range(num_actions[action]):
                        tf.summary.histogram(
                            '{}-{}-{}-output-distribution'.format(action, action_shaped, action_ind),
                            training_output_probabilities[action][action_shaped][:, action_ind] * quantized_steps
                        )
    self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.next_internal_inputs = list(self.target_network.internal_inputs)
        _, target_output_probabilities, _, target_action = self._create_action_outputs(
            self.target_network.output, quantized_steps, self.num_atoms, config, self.action, num_actions
        )
    self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        # Broadcast rewards and discounted support, shape (batch_size, num_atoms). T_z_j in the paper
        reward = tf.expand_dims(self.reward, axis=1)
        terminal = tf.expand_dims(tf.cast(x=self.terminal, dtype=tf.float32), axis=1)
        broadcasted_rewards = reward + (1.0 - terminal) * (quantized_steps * self.discount)

        # Clip into [distribution_min, distribution_max]
        quantized_discounted_reward = tf.clip_by_value(
            broadcasted_rewards, self.distribution_min, self.distribution_max
        )

        # Fractional atom index b and its neighbours l, u in the paper
        closest_quantization = (quantized_discounted_reward - self.distribution_min) / scaling_increment
        # Guard against rounding pushing b marginally outside the support,
        # which would give an invalid index for scatter_nd below.
        closest_quantization = tf.clip_by_value(closest_quantization, 0.0, float(self.num_atoms - 1))
        lower_ind = tf.floor(closest_quantization)
        upper_ind = tf.ceil(closest_quantization)

        # Linear interpolation weights: the nearer atom receives the larger
        # share.  If b is exactly an atom, l == u and both differences are
        # zero, so the indicator hands the full mass to that atom.
        on_atom = tf.cast(tf.equal(lower_ind, upper_ind), closest_quantization.dtype)
        weight_lower = (upper_ind - closest_quantization) + on_atom
        weight_upper = closest_quantization - lower_ind

        # Shared selections for later use
        dynamic_batch_size = tf.shape(self.reward)[0]
        batch_selection = tf.range(0, dynamic_batch_size)
        # Repeat every batch index num_atoms times, then flatten
        batch_tiled_selection = tf.reshape(
            tf.tile(tf.reshape(batch_selection, (-1, 1)), [1, self.num_atoms]), [-1]
        )
        # Pair every flattened batch index with its lower/upper atom index (int32 for indexing)
        batch_lower_inds = tf.stack(
            (batch_tiled_selection, tf.reshape(tf.cast(lower_ind, tf.int32), [-1])), axis=1
        )
        batch_upper_inds = tf.stack(
            (batch_tiled_selection, tf.reshape(tf.cast(upper_ind, tf.int32), [-1])), axis=1
        )

        # Create a loss for each action
        for action in self.action:
            # If the action has a non-empty shape, each head is processed separately
            for action_ind in range(max([util.prod(config.actions[action].shape), 1])):
                # Select the distribution of the action chosen by the target network
                target_batch_action_selection = tf.stack(
                    (batch_selection, target_action[action][action_ind]), axis=1
                )
                target_probabilities_of_action = tf.gather_nd(
                    target_output_probabilities[action][action_ind], target_batch_action_selection
                )

                # Distribute the probability mass onto the neighbouring atoms
                mass_to_lower = target_probabilities_of_action * weight_lower
                mass_to_upper = target_probabilities_of_action * weight_upper

                # m in the paper.  scatter_nd sums values into a zeros tensor
                # instead of overwriting; this is relied upon here, see
                # https://github.com/tensorflow/tensorflow/issues/8102
                target_quantized_probabilities_lower = tf.scatter_nd(
                    batch_lower_inds, tf.reshape(mass_to_lower, [-1]), (dynamic_batch_size, self.num_atoms)
                )
                target_quantized_probabilities_upper = tf.scatter_nd(
                    batch_upper_inds, tf.reshape(mass_to_upper, [-1]), (dynamic_batch_size, self.num_atoms)
                )

                # No gradient should flow back to the target network
                target_quantized_probabilities = tf.stop_gradient(
                    target_quantized_probabilities_lower + target_quantized_probabilities_upper
                )

                # Pick the matching column when the input action has a shape
                if len(self.action[action].shape) > 1:
                    input_action = self.action[action][:, action_ind]
                else:
                    input_action = self.action[action]

                # Cross entropy between projected target and the distribution
                # predicted for the action that was actually taken
                training_action_selection = tf.stack((batch_selection, input_action), axis=1)
                probabilities_for_action = tf.gather_nd(
                    training_output_probabilities[action][action_ind], training_action_selection
                )
                # NOTE(review): overwritten on every iteration, so only the
                # last action head remains in self.loss_per_instance.
                self.loss_per_instance = -tf.reduce_sum(
                    target_quantized_probabilities * tf.log(probabilities_for_action + util.epsilon),
                    axis=-1
                )
                loss = tf.reduce_mean(self.loss_per_instance)
                tf.losses.add_loss(loss)
                tf.summary.scalar('cce-loss-{}-{}'.format(action, action_ind), loss)

    # Update target network
    with tf.name_scope("update_target"):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    """Build the graph of the normalized advantage function (NAF) model.

    A training network produces the action means, the entries of a
    triangular matrix L (with exponentiated diagonal) and a state value; the
    advantage is ``-0.5 (a - mean)^T L L^T (a - mean)`` and Q = V + A.  A
    target network produces the state value used in the TD target.

    Changes compared to the previous revision of this method:

    * ``tf.exp`` was applied *after* ``tf.diag``, so every off-diagonal zero
      of the diagonal matrix became ``exp(0) = 1``.  The diagonal entries are
      now exponentiated before the matrix is formed.
    * The linear branch of the Huber loss was ``|delta| - 0.5``, which is only
      correct for a threshold of 1; it now uses the general form
      ``c * |delta| - 0.5 * c^2`` (identical for ``c == 1``).
    * The TD target is wrapped in ``tf.stop_gradient`` so the loss cannot
      propagate into the target network.

    Args:
        config: Model configuration (``actions``, ``network``, ``discount``,
            ``clip_gradients``, ``update_target_weight``).
    """
    super(NAFModel, self).create_tf_operations(config)
    num_actions = sum(util.prod(config.actions[name].shape) for name in sorted(self.action))

    # Get hidden layers from network generator, then add NAF outputs, same for target network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

    with tf.variable_scope('training_outputs') as training_outputs_scope:
        # Action outputs: one flat mean vector, sliced and reshaped per action
        flat_mean = layers['linear'](x=self.training_network.output, size=num_actions)
        n = 0
        for name in sorted(self.action):
            shape = config.actions[name].shape
            self.action_taken[name] = tf.reshape(
                tensor=flat_mean[:, n:n + util.prod(shape)],
                shape=((-1,) + shape)
            )
            n += util.prod(shape)

        # Advantage computation
        # Network outputs the entries of the triangular matrix L
        lower_triangular_size = num_actions * (num_actions + 1) // 2
        l_entries = layers['linear'](x=self.training_network.output, size=lower_triangular_size)

        # Exponentiate the diagonal entries first, then embed them in a
        # diagonal matrix, so that off-diagonal positions stay exactly zero.
        l_matrix = tf.map_fn(fn=tf.diag, elems=tf.exp(x=l_entries[:, :num_actions]))

        if num_actions > 1:
            offset = num_actions
            l_columns = list()
            # Row k gets k leading zeros (diagonal included) followed by the
            # next num_actions - k off-diagonal entries; the last row is all zeros.
            for zeros, size in enumerate(range(num_actions - 1, -1, -1), 1):
                column = tf.pad(
                    tensor=l_entries[:, offset:offset + size],
                    paddings=((0, 0), (zeros, 0))
                )
                l_columns.append(column)
                offset += size
            l_matrix += tf.stack(values=l_columns, axis=1)

        # P = LL^T
        p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

        flat_action = list()
        for name in sorted(self.action):
            shape = config.actions[name].shape
            flat_action.append(tf.reshape(tensor=self.action[name], shape=(-1, util.prod(shape))))
        flat_action = tf.concat(values=flat_action, axis=1)
        difference = flat_action - flat_mean

        # A = -0.5 (a - mean)P(a - mean)
        advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2))
        advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage)
        advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

        # Q = A + V
        # State-value function
        value = layers['linear'](x=self.training_network.output, size=num_actions)
        q_value = value + advantage
    training_output_vars = tf.contrib.framework.get_variables(scope=training_outputs_scope)

    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)

    with tf.variable_scope('target_outputs') as target_outputs_scope:
        # State-value function
        target_value = layers['linear'](x=self.target_network.output, size=num_actions)
    target_output_vars = tf.contrib.framework.get_variables(scope=target_outputs_scope)

    with tf.name_scope('update'):
        # Entry i is paired with the target value of entry i + 1, hence the
        # [:-1] / [1:] slicing.
        reward = tf.expand_dims(input=self.reward[:-1], axis=1)
        terminal = tf.expand_dims(input=tf.cast(x=self.terminal[:-1], dtype=tf.float32), axis=1)
        q_target = reward + (1.0 - terminal) * config.discount * target_value[1:]
        # The target is treated as a constant for the gradient computation
        delta = tf.stop_gradient(q_target) - q_value[:-1]
        delta = tf.reduce_mean(input_tensor=delta, axis=1)
        self.loss_per_instance = tf.square(x=delta)

        # We observe issues with numerical stability in some tests, clipping can help
        if config.clip_gradients > 0.0:
            threshold = config.clip_gradients
            huber_loss = tf.where(
                condition=(tf.abs(delta) < threshold),
                x=(0.5 * self.loss_per_instance),
                y=(threshold * tf.abs(delta) - 0.5 * threshold ** 2)
            )
            loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(loss)

    with tf.name_scope('update_target'):
        # Combine hidden layer variables and output layer variables
        training_vars = self.training_network.variables + training_output_vars
        target_vars = self.target_network.variables + target_output_vars
        self.target_network_update = list()
        for v_source, v_target in zip(training_vars, target_vars):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    """Create placeholders, training/target networks, Q-loss and target update.

    The training network runs on ``self.state`` and the target network on
    ``self.next_state``; subclasses supply the per-action outputs through
    ``create_training_operations`` and ``create_target_operations``.  The
    loss is the squared (or Huber) mean TD error over all actions.

    Changes compared to the previous revision of this method: the Huber
    branch was enabled by ``config.clip_loss`` but compared against
    ``config.clip_gradients`` and used ``|delta| - 0.5`` as linear part, which
    is only a Huber loss for a threshold of exactly 1.  Threshold and formula
    now both use ``config.clip_loss``: ``c * |delta| - 0.5 * c^2``.

    Args:
        config: Model configuration (``states``, ``actions``, ``network``,
            ``discount``, ``clip_loss``, ``update_target_weight``).
    """
    super(QModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(
                dtype=util.tf_dtype(state.type),
                shape=(None,) + tuple(state.shape),
                name=name
            )

    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)
        self.q_values = self.create_training_operations(config)
    self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)
        self.target_values = self.create_target_operations(config)
    self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        deltas = list()
        terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)
        for name, action in self.action.items():
            reward = self.reward
            terminal = terminal_float
            # Add one axis per action dimension so reward/terminal broadcast
            # against the per-action target values.
            for _ in range(len(config.actions[name].shape)):
                reward = tf.expand_dims(input=reward, axis=1)
                terminal = tf.expand_dims(input=terminal, axis=1)
            q_target = reward + (1.0 - terminal) * config.discount * self.target_values[name]
            # The target is a constant as far as the gradient is concerned
            delta = tf.stop_gradient(q_target) - self.q_values[name]
            delta = tf.reshape(tensor=delta, shape=(-1, util.prod(config.actions[name].shape)))
            deltas.append(delta)

        # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
        delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        self.loss_per_instance = tf.square(delta)

        # If loss clipping is used, calculate the huber loss
        if config.clip_loss > 0.0:
            threshold = config.clip_loss
            huber_loss = tf.where(
                condition=(tf.abs(delta) < threshold),
                x=(0.5 * self.loss_per_instance),
                y=(threshold * tf.abs(delta) - 0.5 * threshold ** 2)
            )
            self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(self.q_loss)

    # Update target network
    with tf.name_scope('update-target'):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    """Build training/target networks, the TD loss and the target update of DQN.

    Both networks are built on ``self.state``; the target value of entry
    ``i + 1`` is paired with reward/terminal/action of entry ``i``
    (``[1:]`` versus ``[:-1]`` slicing).  With ``config.double_dqn`` the
    target network is evaluated at the training network's greedy action,
    otherwise its maximum output is used.

    Changes compared to the previous revision of this method:

    * The linear branch of the Huber loss was ``|delta| - 0.5``, only valid
      for a threshold of 1; it is now ``c * |delta| - 0.5 * c^2`` with
      ``c = config.clip_gradients`` (identical for ``c == 1``).
    * The TD target is wrapped in ``tf.stop_gradient`` so that minimising the
      loss cannot adjust the target network.

    Args:
        config: Model configuration (``actions``, ``network``, ``double_dqn``,
            ``clip_gradients``, ``update_target_weight``).
    """
    super(DQNModel, self).create_tf_operations(config)

    num_actions = {name: action.num_actions for name, action in config.actions}

    # Training network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)
        training_output = dict()
        for action in self.action:
            training_output[action] = layers['linear'](
                x=self.training_network.output, size=num_actions[action]
            )
            # Greedy action with respect to the training network
            self.action_taken[action] = tf.argmax(training_output[action], axis=1)

    # Target network
    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)
        target_value = dict()
        for action in self.action:
            target_output = layers['linear'](x=self.target_network.output, size=num_actions[action])
            if config.double_dqn:
                # Evaluate the target network at the training network's greedy action
                selector = tf.one_hot(self.action_taken[action], num_actions[action])
                target_value[action] = tf.reduce_sum(tf.multiply(target_output, selector), axis=1)
            else:
                target_value[action] = tf.reduce_max(target_output, axis=1)

    with tf.name_scope('update'):
        not_terminal = 1.0 - tf.cast(self.terminal[:-1], tf.float32)
        for action in self.action:
            # One_hot tensor of the actions that have been taken
            action_one_hot = tf.one_hot(self.action[action][:-1], num_actions[action])

            # Training output, so we get the expected rewards given the actual states and actions
            q_value = tf.reduce_sum(training_output[action][:-1] * action_one_hot, axis=1)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            q_target = self.reward[:-1] + not_terminal * self.discount * target_value[action][1:]
            delta = tf.stop_gradient(q_target) - q_value
            # NOTE(review): overwritten per action, only the last one is kept.
            self.loss_per_instance = tf.square(delta)

            # If clipping is used, calculate the huber loss
            if config.clip_gradients > 0.0:
                threshold = config.clip_gradients
                huber_loss = tf.where(
                    tf.abs(delta) < threshold,
                    0.5 * self.loss_per_instance,
                    threshold * tf.abs(delta) - 0.5 * threshold ** 2
                )
                loss = tf.reduce_mean(huber_loss)
            else:
                loss = tf.reduce_mean(self.loss_per_instance)
            tf.losses.add_loss(loss)

    # Update target network
    with tf.name_scope("update_target"):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    """Build the DQN graph for actions that may carry their own shape.

    The linear outputs are reshaped to ``(-1,) + action.shape +
    (num_actions,)`` so that the last axis enumerates the discrete choices.
    Both networks are built on ``self.state``; the target value of entry
    ``i + 1`` is paired with reward/terminal/action of entry ``i``.

    Changes compared to the previous revision of this method:

    * The double-DQN selector used ``action_shapes[action][1]`` as one-hot
      depth.  For an action with a non-empty shape that element is the first
      action dimension, not the number of choices; the depth is now
      ``num_actions``, matching the last axis of the reshaped output.
    * The Huber branch was enabled by ``config.clip_loss`` but compared
      against ``config.clip_gradients`` and used ``|delta| - 0.5`` as linear
      part.  Threshold and formula now both use ``config.clip_loss``:
      ``c * |delta| - 0.5 * c^2``.
    * The TD target is wrapped in ``tf.stop_gradient`` so that minimising the
      loss cannot adjust the target network.

    Args:
        config: Model configuration (``actions``, ``network``, ``double_dqn``,
            ``discount``, ``clip_loss``, ``update_target_weight``).
    """
    super(DQNModel, self).create_tf_operations(config)

    flat_action_sizes = {
        name: util.prod(action.shape) * action.num_actions
        for name, action in config.actions
    }
    action_shapes = {
        name: (-1,) + action.shape + (action.num_actions,)
        for name, action in config.actions
    }

    # Training network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)
        self.training_output = dict()
        for action in self.action:
            output = layers['linear'](x=self.training_network.output, size=flat_action_sizes[action])
            self.training_output[action] = tf.reshape(tensor=output, shape=action_shapes[action])
            # Greedy choice along the last (num_actions) axis
            self.action_taken[action] = tf.argmax(self.training_output[action], axis=-1)

    # Target network
    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)
        target_value = dict()
        for action in self.action:
            output = layers['linear'](x=self.target_network.output, size=flat_action_sizes[action])
            output = tf.reshape(tensor=output, shape=action_shapes[action])
            if config.double_dqn:
                # The one-hot axis has to line up with the last axis of
                # `output`, whose size is the number of discrete choices.
                selector = tf.one_hot(
                    indices=self.action_taken[action],
                    depth=config.actions[action].num_actions
                )
                target_value[action] = tf.reduce_sum(input_tensor=(output * selector), axis=-1)
            else:
                target_value[action] = tf.reduce_max(input_tensor=output, axis=-1)

    with tf.name_scope('update'):
        self.actions_one_hot = dict()
        self.q_values = dict()
        deltas = list()
        for action in self.action:
            # One_hot tensor of the actions that have been taken
            self.actions_one_hot[action] = tf.one_hot(
                indices=self.action[action][:-1],
                depth=config.actions[action].num_actions
            )

            # Training output, so we get the expected rewards given the actual states and actions
            self.q_values[action] = tf.reduce_sum(
                input_tensor=(self.training_output[action][:-1] * self.actions_one_hot[action]),
                axis=-1
            )

            reward = self.reward[:-1]
            terminal = tf.cast(x=self.terminal[:-1], dtype=tf.float32)
            # One extra axis per action dimension so that reward/terminal
            # broadcast against the shaped target values.
            for _ in range(len(config.actions[action].shape)):
                reward = tf.expand_dims(input=reward, axis=1)
                terminal = tf.expand_dims(input=terminal, axis=1)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            q_target = reward + (1.0 - terminal) * config.discount * target_value[action][1:]
            delta = tf.stop_gradient(q_target) - self.q_values[action]

            # Unstack every action dimension so each list entry is one
            # per-instance delta vector.
            ds_list = [delta]
            for _ in range(len(config.actions[action].shape)):
                ds_list = [d for ds in ds_list for d in tf.unstack(value=ds, axis=1)]
            deltas.extend(ds_list)

        delta = tf.add_n(inputs=deltas) / len(deltas)
        self.loss_per_instance = tf.square(delta)

        # If loss clipping is used, calculate the huber loss
        if config.clip_loss > 0.0:
            threshold = config.clip_loss
            huber_loss = tf.where(
                condition=(tf.abs(delta) < threshold),
                x=(0.5 * self.loss_per_instance),
                y=(threshold * tf.abs(delta) - 0.5 * threshold ** 2)
            )
            loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        self.dqn_loss = loss
        tf.losses.add_loss(loss)

    # Update target network
    with tf.name_scope('update_target'):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    """Build the graph of the normalized advantage function (NAF) model.

    A training network produces one mean per action, the entries of a
    triangular matrix L (with exponentiated diagonal) and a scalar state
    value; the advantage is ``-0.5 (a - mean)^T L L^T (a - mean)`` and
    Q = V + A.  A target network produces the state value for the TD target.

    Changes compared to the previous revision of this method:

    * ``self.action_taken[action]`` was ``mean[n]``, i.e. row ``n`` of the
      batch; it is now column ``n`` (``mean[:, n]``), the mean of action ``n``
      for every instance.
    * The off-diagonal loop produced only ``num_actions - 1`` rows, which does
      not match the ``num_actions x num_actions`` diagonal matrix it is added
      to; it now produces ``num_actions`` rows (the last one all zeros).
    * ``tf.exp`` was applied after ``tf.diag``, turning every off-diagonal
      zero into 1; the diagonal entries are now exponentiated first.
    * Output-layer variables were looked up by the bare scope *string*, which
      stops matching once the model is built inside any enclosing variable
      scope; the scope objects are used instead.
    * The ``(batch, 1)`` target value was combined with rank-1 reward and
      Q-value tensors, which broadcasts to a ``(batch, batch)`` matrix; it is
      now squeezed to rank 1 first.
    * The Huber linear branch uses the general ``c * |delta| - 0.5 * c^2`` and
      the TD target is wrapped in ``tf.stop_gradient``.
    * Commented-out dead code was removed.

    Args:
        config: Model configuration (``network``, ``discount``,
            ``clip_gradients``, ``update_target_weight``).
    """
    super(NAFModel, self).create_tf_operations(config)

    # Get hidden layers from network generator, then add NAF outputs, same for target network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

    with tf.variable_scope('training_outputs') as training_outputs_scope:
        num_actions = len(self.action)

        # Action outputs: column n of `mean` belongs to the n-th action name in sorted order
        mean = layers['linear'](x=self.training_network.output, size=num_actions)
        for n, action in enumerate(sorted(self.action)):
            self.action_taken[action] = mean[:, n]

        # Advantage computation
        # Network outputs the entries of the triangular matrix L
        lower_triangular_size = num_actions * (num_actions + 1) // 2
        l_entries = layers['linear'](x=self.training_network.output, size=lower_triangular_size)

        # Diagonal elements are exponentiated before being embedded in the
        # matrix, so the off-diagonal positions stay exactly zero.
        l_matrix = tf.map_fn(tf.diag, tf.exp(l_entries[:, :num_actions]))

        if num_actions > 1:
            offset = num_actions
            l_columns = list()
            # Row k gets k leading zeros (diagonal included) followed by the
            # next num_actions - k off-diagonal entries; the last row is all zeros.
            for zeros, size in enumerate(range(num_actions - 1, -1, -1), 1):
                column = tf.pad(l_entries[:, offset: offset + size], ((0, 0), (zeros, 0)))
                l_columns.append(column)
                offset += size
            l_matrix += tf.stack(l_columns, 1)

        # P = LL^T
        p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))

        actions = tf.stack(values=[self.action[name] for name in sorted(self.action)], axis=1)
        action_diff = actions - mean

        # A = -0.5 (a - mean)P(a - mean)
        advantage = -tf.matmul(
            tf.expand_dims(action_diff, 1),
            tf.matmul(p_matrix, tf.expand_dims(action_diff, 2))
        ) / 2
        advantage = tf.squeeze(advantage, 2)

        # Q = A + V
        # State-value function
        value = layers['linear'](x=self.training_network.output, size=1)
        q_value = tf.squeeze(value + advantage, 1)
    training_output_vars = tf.contrib.framework.get_variables(scope=training_outputs_scope)

    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)

    target_value = dict()
    with tf.variable_scope('target_outputs') as target_outputs_scope:
        # State-value function, squeezed to rank 1 so that it combines
        # element-wise with the rank-1 q_value (assumes self.reward and
        # self.terminal are rank 1 as well -- TODO confirm).
        target_value_output = tf.squeeze(
            layers['linear'](x=self.target_network.output, size=1), axis=1
        )
        for action in self.action:
            # Naf directly outputs V(s)
            target_value[action] = target_value_output
    target_output_vars = tf.contrib.framework.get_variables(scope=target_outputs_scope)

    with tf.name_scope("update"):
        not_terminal = 1.0 - tf.cast(self.terminal[:-1], tf.float32)
        # NOTE(review): every action shares the same target and Q-value, so
        # this registers the same loss once per action; kept to preserve the
        # existing loss scaling.
        for action in self.action:
            q_target = self.reward[:-1] + not_terminal * config.discount * target_value[action][1:]
            # The target is a constant as far as the gradient is concerned
            delta = tf.stop_gradient(q_target) - q_value[:-1]
            self.loss_per_instance = tf.square(delta)

            # We observe issues with numerical stability in some tests, clipping can help
            if config.clip_gradients > 0.0:
                threshold = config.clip_gradients
                huber_loss = tf.where(
                    tf.abs(delta) < threshold,
                    0.5 * self.loss_per_instance,
                    threshold * tf.abs(delta) - 0.5 * threshold ** 2
                )
                loss = tf.reduce_mean(huber_loss)
            else:
                loss = tf.reduce_mean(self.loss_per_instance)
            tf.losses.add_loss(loss)

    with tf.name_scope("update_target"):
        # Combine hidden layer variables and output layer variables
        training_vars = self.training_network.variables + training_output_vars
        target_vars = self.target_network.variables + target_output_vars
        self.target_network_update = list()
        for v_source, v_target in zip(training_vars, target_vars):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)