def create_tf_operations(self, config):
    super(DQNModel, self).create_tf_operations(config)

    num_actions = {name: action.num_actions for name, action in config.actions}

    # Training network
    with tf.variable_scope('training'):
        self.training_network = NeuralNetwork(config.network, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        training_output = dict()
        for action in self.action:
            training_output[action] = layers['linear'](x=self.training_network.output, size=num_actions[action])
            self.action_taken[action] = tf.argmax(training_output[action], axis=1)

    # Target network
    with tf.variable_scope('target'):
        self.target_network = NeuralNetwork(config.network, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)

        target_value = dict()
        for action in self.action:
            target_output = layers['linear'](x=self.target_network.output, size=num_actions[action])
            if config.double_dqn:
                selector = tf.one_hot(self.action_taken[action], num_actions[action])
                target_value[action] = tf.reduce_sum(tf.multiply(target_output, selector), axis=1)
            else:
                target_value[action] = tf.reduce_max(target_output, axis=1)

    with tf.name_scope('update'):
        for action in self.action:
            # One-hot tensor of the actions that have been taken
            action_one_hot = tf.one_hot(self.action[action][:-1], num_actions[action])

            # Training output, so we get the expected rewards given the actual states and actions
            q_value = tf.reduce_sum(training_output[action][:-1] * action_one_hot, axis=1)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            q_target = self.reward[:-1] + (1.0 - tf.cast(self.terminal[:-1], tf.float32)) * self.discount * target_value[action][1:]
            delta = q_target - q_value

            # If gradient clipping is used, calculate the Huber loss
            if config.clip_gradients > 0.0:
                huber_loss = tf.where(tf.abs(delta) < config.clip_gradients, 0.5 * tf.square(delta), tf.abs(delta) - 0.5)
                loss = tf.reduce_mean(huber_loss)
            else:
                loss = tf.reduce_mean(tf.square(delta))
            tf.losses.add_loss(loss)

    # Update target network
    with tf.name_scope("update_target"):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    super(QModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(dtype=util.tf_dtype(state.type), shape=(None,) + tuple(state.shape), name=('next_' + name))

    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state, summary_level=config.tf_summary_level)
        self.network_internal_index = len(self.internal_inputs)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)
        self.q_values = self.create_training_operations(config)
        self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.next_internal_inputs = list(self.target_network.internal_inputs)
        self.target_values = self.create_target_operations(config)
        self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        deltas = self.create_q_deltas(config)

        # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
        delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        self.loss_per_instance = tf.square(delta)

        # If loss clipping is used, calculate the Huber loss
        if config.clip_loss > 0.0:
            huber_loss = tf.where(
                condition=(tf.abs(delta) < config.clip_loss),
                x=(0.5 * self.loss_per_instance),
                y=(config.clip_loss * tf.abs(delta) - 0.5 * config.clip_loss ** 2))
            self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(self.q_loss)

        # For each loss over an action create a summary
        if len(self.q_loss.shape) > 1:
            for action_ind in range(self.q_loss.shape[1]):
                tf.summary.scalar('q-loss-action-{}'.format(action_ind), self.q_loss[action_ind])
        else:
            tf.summary.scalar('q-loss', self.q_loss)

    # Update target network
    with tf.name_scope('update-target'):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
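# The loss-clipping branch above is a Huber loss: quadratic for |delta| below config.clip_loss,
# linear beyond it. A minimal NumPy sketch of that piecewise formula (the threshold name `clip`
# and the sample deltas are illustrative, not part of the model code):
import numpy as np

def huber_loss(delta, clip=1.0):
    # Quadratic for |delta| < clip, linear with slope `clip` beyond, matching the tf.where branch above.
    return np.where(np.abs(delta) < clip,
                    0.5 * np.square(delta),
                    clip * np.abs(delta) - 0.5 * clip ** 2)

# Small errors are squared, large errors grow linearly: [0.125, 1.5]
print(huber_loss(np.array([0.5, 2.0])))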
def create_tf_operations(self, config):
    if len(config.states) > 1:
        raise Exception()
    with tf.variable_scope('mlp_value_function'):
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(next(iter(config.states))[1].shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        network_builder = layered_network_builder((
            {'type': 'dense', 'size': self.size},
            {'type': 'dense', 'size': 1}
        ))
        network = NeuralNetwork(network_builder=network_builder, inputs=dict(state=self.state))
        self.prediction = network.output
        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, config):
    if len(config.states) > 1:
        raise Exception()
    with tf.variable_scope('mlp_value_function'):
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(next(iter(config.states))[1].shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.updates = int(config.batch_size / self.update_batch_size) * self.epochs
        self.batch_size = config.batch_size

        layers = []
        for _ in xrange(self.hidden_layers):
            layers.append({'type': 'dense', 'size': self.size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers), inputs=dict(state=self.state))
        self.prediction = network.output
        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, state, batch_size, scope='cnn_baseline'):
    with tf.variable_scope(scope):
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.updates = int(batch_size / self.update_batch_size) * self.epochs
        self.batch_size = batch_size

        layers = []
        for size in self.sizes:
            layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})
        # First layer has a larger window
        layers[0]['window'] = 5
        # TODO: append max-pooling
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers), inputs=dict(state=self.state))
        self.prediction = network.output
        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, state, scope='cnn_baseline'):
    with tf.variable_scope(scope) as scope:
        self.state = tf.placeholder(dtype=tf.float32, shape=(None,) + tuple(state.shape))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        layers = []
        for size in self.cnn_sizes:
            layers.append({'type': 'conv2d', 'size': size, 'stride': 1, 'window': 3})
        # First layer has a larger window
        layers[0]['window'] = 5
        layers.append({'type': 'flatten'})
        for size in self.dense_sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers), inputs=dict(state=self.state))
        self.prediction = tf.squeeze(input=network.output, axis=1)
        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        variables = tf.contrib.framework.get_variables(scope=scope)
        self.optimize = optimizer.minimize(loss, var_list=variables)
def create_tf_operations(self, state, batch_size, scope='mlp_baseline'):
    with tf.variable_scope(scope):
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))
        self.updates = int(batch_size / self.update_batch_size) * self.epochs
        self.batch_size = batch_size

        layers = []
        for size in self.sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers), inputs=dict(state=self.state))
        self.prediction = network.output
        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_tf_operations(self, config):
    super(PolicyGradientModel, self).create_tf_operations(config)

    with tf.variable_scope('value_function'):
        network_builder = util.get_function(fct=config.network)
        self.network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.network.internal_inputs)
        self.internal_outputs.extend(self.network.internal_outputs)
        self.internal_inits.extend(self.network.internal_inits)

    with tf.variable_scope('distribution'):
        for action, distribution in self.distribution.items():
            with tf.variable_scope(action):
                distribution.create_tf_operations(x=self.network.output, deterministic=self.deterministic)
                self.action_taken[action] = distribution.sample()

    if self.baseline:
        with tf.variable_scope('baseline'):
            # Generate one baseline per state input, later average their predictions
            for name, state in config.states:
                self.baseline[name].create_tf_operations(state, scope='baseline_' + name)
def create_tf_operations(self, config): """ Create tensorflow ops :return: """ super(SimpleQModel, self).create_tf_operations(config) with tf.name_scope("simpleq"): self.network = NeuralNetwork(config.network, inputs=self.state) self.network_output = layers['linear'](x=self.network.output, size=self.action_count) with tf.name_scope("predict"): self.q_action = tf.argmax(self.network_output, axis=1) with tf.name_scope("update"): # We need the Q values of the current states to calculate the difference ("loss") between the # expected values and the new values (q targets). Therefore we do a forward-pass # and reduce the results to the actions that have been taken. # One_hot tensor of the actions that have been taken. actions_one_hot = tf.one_hot(self.action['action'][:-1], self.action_count, 1.0, 0.0, name='action_one_hot') # Training output, reduced to the actions that have been taken. q_values_actions_taken = tf.reduce_sum( self.network_output[:-1] * actions_one_hot, axis=1, name='q_acted') # Expected values for the next states q_output = tf.reduce_max(self.network_output[1:], axis=1, name='q_expected') # Bellmann equation Q = r + y * Q' q_targets = self.reward[:-1] + (1. - tf.cast(self.terminal[:-1], tf.float32)) \ * self.gamma * q_output # The loss is the difference between the q_targets and the expected q values. self.loss = tf.reduce_sum( tf.square(q_targets - q_values_actions_taken)) # self.optimize_op = self.optimizer.minimize(self.loss) tf.losses.add_loss(self.loss)
def create_tf_operations(self, config):
    super(PolicyGradientModel, self).create_tf_operations(config)

    with tf.variable_scope('value_function'):
        self.network = NeuralNetwork(config.network, inputs=self.state)
        self.internal_inputs.extend(self.network.internal_inputs)
        self.internal_outputs.extend(self.network.internal_outputs)
        self.internal_inits.extend(self.network.internal_inits)

    with tf.variable_scope('distribution'):
        for action, distribution in self.distribution.items():
            distribution.create_tf_operations(x=self.network.output, sample=config.sample_actions)
            self.action_taken[action] = distribution.value

    if self.baseline:
        with tf.variable_scope('baseline'):
            self.baseline.create_tf_operations(config)
def create_tf_operations(self, state, scope='mlp_baseline'):
    with tf.variable_scope(scope) as scope:
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        layers = []
        for size in self.sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(network_builder=layered_network_builder(layers), inputs=dict(state=self.state))
        self.prediction = tf.squeeze(input=network.output, axis=1)
        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        variables = tf.contrib.framework.get_variables(scope=scope)
        self.optimize = optimizer.minimize(loss, var_list=variables)
def create_tf_operations(self, config):
    super(CategoricalDQNModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(dtype=util.tf_dtype(state.type), shape=(None,) + tuple(state.shape), name=name)

    # Set up constants delta_z and z. z represents the discretized scaling over vmin -> vmax
    scaling_increment = (self.distribution_max - self.distribution_min) / (self.num_atoms - 1)  # delta_z in the paper
    quantized_steps = self.distribution_min + np.arange(self.num_atoms) * scaling_increment  # z in the paper

    num_actions = {name: action.num_actions for name, action in config.actions}

    # Creating networks
    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state, summary_level=config.tf_summary_level)
        self.network_internal_index = len(self.internal_inputs)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        training_output_logits, training_output_probabilities, training_qval, action_taken = self._create_action_outputs(
            self.training_network.output, quantized_steps, self.num_atoms, config, self.action, num_actions)

        # Stack to preserve action_taken shape like (batch_size, num_actions)
        for action in self.action:
            if len(action_taken[action]) > 1:
                self.action_taken[action] = tf.stack(action_taken[action], axis=1)
            else:
                self.action_taken[action] = action_taken[action][0]

            # Summarize expected reward histogram
            if config.tf_summary_level >= 1:
                for action_shaped in range(len(action_taken[action])):
                    for action_ind in range(num_actions[action]):
                        tf.summary.histogram(
                            '{}-{}-{}-output-distribution'.format(action, action_shaped, action_ind),
                            training_output_probabilities[action][action_shaped][:, action_ind] * quantized_steps)

        self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.next_internal_inputs = list(self.target_network.internal_inputs)

        _, target_output_probabilities, target_qval, target_action = self._create_action_outputs(
            self.target_network.output, quantized_steps, self.num_atoms, config, self.action, num_actions)

        self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        # Broadcast rewards and discounted quantization. Shape (batch_size, num_atoms). T_z_j in the paper
        reward = tf.expand_dims(self.reward, axis=1)
        terminal = tf.expand_dims(tf.cast(x=self.terminal, dtype=tf.float32), axis=1)
        broadcasted_rewards = reward + (1.0 - terminal) * (quantized_steps * self.discount)
        # Clip into distribution_min, distribution_max
        quantized_discounted_reward = tf.clip_by_value(broadcasted_rewards, self.distribution_min, self.distribution_max)

        # Compute quantization indices. b, l, u in the paper
        closest_quantization = (quantized_discounted_reward - self.distribution_min) / scaling_increment
        lower_ind = tf.floor(closest_quantization)
        upper_ind = tf.ceil(closest_quantization)

        # Create shared selections for later use
        dynamic_batch_size = tf.shape(self.reward)[0]
        batch_selection = tf.range(0, dynamic_batch_size)
        # tile expects a tensor of same shape, we are just repeating the selection num_atoms times across the last dimension
        batch_tiled_selection = tf.reshape(tf.tile(tf.reshape(batch_selection, (-1, 1)), [1, self.num_atoms]), [-1])
        # Combine with lower and upper ind, same as zip(flatten(batch_tiled_selection), flatten(lower_ind))
        # Also cast to int32 to use as index
        batch_lower_inds = tf.stack((batch_tiled_selection, tf.reshape(tf.cast(lower_ind, tf.int32), [-1])), axis=1)
        batch_upper_inds = tf.stack((batch_tiled_selection, tf.reshape(tf.cast(upper_ind, tf.int32), [-1])), axis=1)

        # Create loss for each action
        for action in self.action:
            # If shape of action != () we need to process each action head separately
            for action_ind in range(max([util.prod(config.actions[action].shape), 1])):
                # Project onto the supports
                # TensorFlow indexing is still not great, we stack these two and use gather_nd later
                target_batch_action_selection = tf.stack((batch_selection, target_action[action][action_ind]), axis=1)

                # Distribute probability scaled by distance
                # In numpy the equivalent is target_output_probabilities[action][batch_selection, target_action]
                target_probabilities_of_action = tf.gather_nd(target_output_probabilities[action][action_ind], target_batch_action_selection)
                distance_lower = target_probabilities_of_action * (closest_quantization - lower_ind)
                distance_upper = target_probabilities_of_action * (upper_ind - closest_quantization)

                # Sum distances aligned into quantized bins. m in the paper
                # scatter_nd actually sums the values into a zeros tensor instead of overwriting
                # This is pretty much a huge hack, refer to https://github.com/tensorflow/tensorflow/issues/8102
                target_quantized_probabilities_lower = tf.scatter_nd(batch_lower_inds, tf.reshape(distance_lower, [-1]), (dynamic_batch_size, self.num_atoms))
                target_quantized_probabilities_upper = tf.scatter_nd(batch_upper_inds, tf.reshape(distance_upper, [-1]), (dynamic_batch_size, self.num_atoms))

                # No gradient should flow back to the target network
                target_quantized_probabilities = tf.stop_gradient(target_quantized_probabilities_lower + target_quantized_probabilities_upper)

                # We must check if the input action has a shape
                if len(self.action[action].shape) > 1:
                    input_action = self.action[action][:, action_ind]
                else:
                    input_action = self.action[action]

                # Now we have target probabilities; the loss is categorical cross entropy using logits
                # Compare to the actions we actually took
                training_action_selection = tf.stack((batch_selection, input_action), axis=1)
                probabilities_for_action = tf.gather_nd(training_output_probabilities[action][action_ind], training_action_selection)

                self.loss_per_instance = -tf.reduce_sum(target_quantized_probabilities * tf.log(probabilities_for_action + util.epsilon), axis=-1)
                loss = tf.reduce_mean(self.loss_per_instance)
                tf.losses.add_loss(loss)
                tf.summary.scalar('cce-loss-{}-{}'.format(action, action_ind), loss)

    # Update target network
    with tf.name_scope("update_target"):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
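# The update scope above projects the discounted target distribution onto the fixed support and
# accumulates probability mass with scatter_nd. For reference, a minimal NumPy sketch of that
# projection as described in Algorithm 1 of the C51 paper; the support parameters, discount and
# inputs below are illustrative assumptions, not values taken from the model.
import numpy as np

v_min, v_max, num_atoms, discount = -10.0, 10.0, 11, 0.99
delta_z = (v_max - v_min) / (num_atoms - 1)   # scaling_increment above
z = v_min + np.arange(num_atoms) * delta_z    # quantized_steps above

def project_distribution(target_probs, reward, terminal):
    # Discounted, clipped support positions (T_z_j) and their fractional atom indices (b).
    t_z = np.clip(reward + (1.0 - terminal) * discount * z, v_min, v_max)
    b = (t_z - v_min) / delta_z
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)
    # If b lands exactly on an atom, floor == ceil and its mass would vanish; shift one neighbour.
    lower[(upper > 0) & (lower == upper)] -= 1
    upper[(lower < num_atoms - 1) & (lower == upper)] += 1
    m = np.zeros(num_atoms)
    np.add.at(m, lower, target_probs * (upper - b))  # mass assigned to the lower neighbour
    np.add.at(m, upper, target_probs * (b - lower))  # mass assigned to the upper neighbour
    return m

# Uniform target distribution, reward 1.0, non-terminal transition; the result still sums to 1.
print(project_distribution(np.full(num_atoms, 1.0 / num_atoms), reward=1.0, terminal=0.0))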
def create_tf_operations(self, config):
    super(NAFModel, self).create_tf_operations(config)

    num_actions = sum(util.prod(config.actions[name].shape) for name in sorted(self.action))

    # Get hidden layers from network generator, then add NAF outputs, same for target network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs') as scope:
            # Action outputs
            flat_mean = layers['linear'](x=self.training_network.output, size=num_actions)
            n = 0
            for name in sorted(self.action):
                shape = config.actions[name].shape
                self.action_taken[name] = tf.reshape(tensor=flat_mean[:, n:n + util.prod(shape)], shape=((-1,) + shape))
                n += util.prod(shape)

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output, size=lower_triangular_size)
            l_matrix = tf.exp(x=tf.map_fn(fn=tf.diag, elems=l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, -1, -1), 1):
                    column = tf.pad(tensor=l_entries[:, offset:offset + size], paddings=((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(values=l_columns, axis=1)

            # P = LL^T
            p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

            flat_action = list()
            for name in sorted(self.action):
                shape = config.actions[name].shape
                flat_action.append(tf.reshape(tensor=self.action[name], shape=(-1, util.prod(shape))))
            flat_action = tf.concat(values=flat_action, axis=1)
            difference = flat_action - flat_mean

            # A = -0.5 * (a - mean)^T P (a - mean)
            advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2))
            advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage)
            advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output, size=num_actions)
            q_value = value + advantage

            training_output_vars = tf.contrib.framework.get_variables(scope=scope)

    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)

        with tf.variable_scope('target_outputs') as scope:
            # State-value function
            target_value = layers['linear'](x=self.target_network.output, size=num_actions)
            target_output_vars = tf.contrib.framework.get_variables(scope=scope)

    with tf.name_scope('update'):
        reward = tf.expand_dims(input=self.reward[:-1], axis=1)
        terminal = tf.expand_dims(input=tf.cast(x=self.terminal[:-1], dtype=tf.float32), axis=1)
        q_target = reward + (1.0 - terminal) * config.discount * target_value[1:]
        delta = q_target - q_value[:-1]
        delta = tf.reduce_mean(input_tensor=delta, axis=1)
        self.loss_per_instance = tf.square(x=delta)

        # We observe issues with numerical stability in some tests, gradient clipping can help
        if config.clip_gradients > 0.0:
            huber_loss = tf.where(
                condition=(tf.abs(delta) < config.clip_gradients),
                x=(0.5 * self.loss_per_instance),
                y=(tf.abs(delta) - 0.5))
            loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(loss)

    with tf.name_scope('update_target'):
        # Combine hidden layer variables and output layer variables
        training_vars = self.training_network.variables + training_output_vars
        target_vars = self.target_network.variables + target_output_vars

        self.target_network_update = list()
        for v_source, v_target in zip(training_vars, target_vars):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
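# A small NumPy sketch of the quadratic advantage assembled above, A(s, a) = -0.5 * (a - mean)^T P (a - mean)
# with P = L L^T. The triangular fill order here is simplified relative to the column-padding loop above,
# and the example inputs are purely illustrative, not taken from the model.
import numpy as np

def naf_advantage(action, mean, l_entries, num_actions):
    # Build L from flat entries: exponentiated diagonal (as in the tf.exp/tf.map_fn line above)
    # plus a strictly triangular remainder, so P = L L^T stays positive definite.
    l_matrix = np.zeros((num_actions, num_actions))
    l_matrix[np.diag_indices(num_actions)] = np.exp(l_entries[:num_actions])
    l_matrix[np.tril_indices(num_actions, k=-1)] = l_entries[num_actions:]
    p_matrix = l_matrix @ l_matrix.T
    diff = action - mean
    return -0.5 * diff @ p_matrix @ diff

# Two action dimensions: 2 diagonal entries + 1 off-diagonal entry.
print(naf_advantage(np.array([0.2, -0.1]), np.zeros(2), np.array([0.0, 0.0, 0.5]), num_actions=2))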
def create_tf_operations(self, config):
    super(DQNModel, self).create_tf_operations(config)

    flat_action_sizes = {name: util.prod(action.shape) * action.num_actions for name, action in config.actions}
    action_shapes = {name: (-1,) + action.shape + (action.num_actions,) for name, action in config.actions}

    # Training network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        self.training_output = dict()
        for action in self.action:
            output = layers['linear'](x=self.training_network.output, size=flat_action_sizes[action])
            self.training_output[action] = tf.reshape(tensor=output, shape=action_shapes[action])
            self.action_taken[action] = tf.argmax(self.training_output[action], axis=-1)

    # Target network
    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)

        target_value = dict()
        for action in self.action:
            output = layers['linear'](x=self.target_network.output, size=flat_action_sizes[action])
            output = tf.reshape(tensor=output, shape=action_shapes[action])
            if config.double_dqn:
                selector = tf.one_hot(indices=self.action_taken[action], depth=action_shapes[action][1])
                target_value[action] = tf.reduce_sum(input_tensor=(output * selector), axis=-1)
            else:
                target_value[action] = tf.reduce_max(input_tensor=output, axis=-1)

    with tf.name_scope('update'):
        self.actions_one_hot = dict()
        self.q_values = dict()
        deltas = list()
        for action in self.action:
            # One-hot tensor of the actions that have been taken
            self.actions_one_hot[action] = tf.one_hot(indices=self.action[action][:-1], depth=config.actions[action].num_actions)

            # Training output, so we get the expected rewards given the actual states and actions
            self.q_values[action] = tf.reduce_sum(input_tensor=(self.training_output[action][:-1] * self.actions_one_hot[action]), axis=-1)

            reward = self.reward[:-1]
            terminal = tf.cast(x=self.terminal[:-1], dtype=tf.float32)
            for _ in range(len(config.actions[action].shape)):
                reward = tf.expand_dims(input=reward, axis=1)
                terminal = tf.expand_dims(input=terminal, axis=1)

            # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
            q_target = reward + (1.0 - terminal) * config.discount * target_value[action][1:]
            delta = q_target - self.q_values[action]

            ds_list = [delta]
            for _ in range(len(config.actions[action].shape)):
                ds_list = [d for ds in ds_list for d in tf.unstack(value=ds, axis=1)]
            deltas.extend(ds_list)

        delta = tf.add_n(inputs=deltas) / len(deltas)
        self.loss_per_instance = tf.square(delta)

        # If loss clipping is used, calculate the Huber loss
        if config.clip_loss > 0.0:
            huber_loss = tf.where(
                condition=(tf.abs(delta) < config.clip_loss),
                x=(0.5 * self.loss_per_instance),
                y=(tf.abs(delta) - 0.5))
            loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        self.dqn_loss = loss
        tf.losses.add_loss(loss)

    # Update target network
    with tf.name_scope('update_target'):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_network.variables, self.target_network.variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config): """Create training graph. For DQFD, we build the double-dqn training graph and modify the double_q_loss function according to eq. 5 Args: config: Config dict. Returns: """ super(DQFDModel, self).create_tf_operations(config) num_actions = { name: action.num_actions for name, action in config.actions } # placeholders with tf.variable_scope('placeholders'): self.q_targets = tf.placeholder(tf.float32, (None, ), name='q_targets') # Training network with tf.variable_scope('training'): self.training_network = NeuralNetwork( config.network, inputs={name: state for name, state in self.state.items()}) self.internal_inputs.extend(self.training_network.internal_inputs) self.internal_outputs.extend( self.training_network.internal_outputs) self.internal_inits.extend(self.training_network.internal_inits) training_output = dict() for action in self.action: training_output[action] = layers['linear']( x=self.training_network.output, size=num_actions[action]) self.action_taken[action] = tf.argmax(training_output[action], axis=1) # Target network with tf.variable_scope('target'): self.target_network = NeuralNetwork( config.network, inputs={name: state for name, state in self.state.items()}) self.internal_inputs.extend(self.target_network.internal_inputs) self.internal_outputs.extend(self.target_network.internal_outputs) self.internal_inits.extend(self.target_network.internal_inits) target_value = dict() for action in self.action: target_output = layers['linear'](x=self.target_network.output, size=num_actions[action]) selector = tf.one_hot(self.action_taken[action], num_actions[action]) target_value[action] = tf.reduce_sum(tf.multiply( target_output, selector), axis=1) with tf.name_scope("update"): self.dqfd_opt = [] for action in self.action: # Self.q_targets gets fed the actual observed rewards and expected future rewards # One_hot tensor of the actions that have been taken action_one_hot = tf.one_hot(self.action[action][:-1], num_actions[action]) # Training output, so we get the expected rewards given the actual states and actions q_value = tf.reduce_sum(training_output[action][:-1] * action_one_hot, axis=1) # Surrogate loss as the mean squared error between actual observed rewards and expected rewards q_target = self.reward[:-1] + ( 1.0 - tf.cast(self.terminal[:-1], tf.float32) ) * self.discount * target_value[action][1:] delta = q_target - q_value self.loss_per_instance = tf.square(delta) # If gradient clipping is used, calculate the huber loss if config.clip_gradients > 0.0: huber_loss = tf.where( tf.abs(delta) < config.clip_gradients, 0.5 * self.loss_per_instance, tf.abs(delta) - 0.5) double_q_loss = tf.reduce_mean(huber_loss) else: double_q_loss = tf.reduce_mean(self.loss_per_instance) # Use the existing loss structure from the model here, then compute dqfd loss separately tf.losses.add_loss(double_q_loss) # Create the supervised margin loss mask = tf.ones_like(action_one_hot, dtype=tf.float32) # Zero for the action taken, one for all other actions, now multiply by expert margin inverted_one_hot = mask - action_one_hot # max_a([Q(s,a) + l(s,a_E,a)], l(s,a_E, a) is 0 for expert action and margin value for others expert_margin = training_output[action][:-1] + tf.multiply( inverted_one_hot, config.expert_margin) supervised_selector = tf.reduce_max( expert_margin, axis=1, name='expert_margin_selector') # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)] - Q(s,a_E) supervised_loss = supervised_selector - q_value # Combining double q loss with supervised loss dqfd_loss = 
double_q_loss + tf.multiply( tf.reduce_mean(supervised_loss), config.supervised_weight) # This decomposition is not necessary, we just want to be able to export gradients dqfd_grads_and_vars = self.optimizer.compute_gradients( dqfd_loss) self.dqfd_opt.append( self.optimizer.apply_gradients(dqfd_grads_and_vars)) # Update target network according to update weight self.target_network_update = [] with tf.name_scope("update_target"): for v_source, v_target in zip(self.training_network.variables, self.target_network.variables): update = v_target.assign_sub(config.update_target_weight * (v_target - v_source)) self.target_network_update.append(update)
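# The supervised margin term built above corresponds to J_E(Q) = max_a [Q(s,a) + l(a_E,a)] - Q(s,a_E),
# where l(a_E,a) is 0 for the expert action and a fixed margin otherwise. A minimal NumPy sketch of
# that expression for a single state; the Q-values and margin below are illustrative, not model values.
import numpy as np

def dqfd_margin_loss(q_values, expert_action, expert_margin=0.5):
    # Zero margin for the expert action, expert_margin for all other actions.
    margins = np.full_like(q_values, expert_margin)
    margins[expert_action] = 0.0
    # Zero when Q(s, a_E) already exceeds every other Q(s, a) by at least the margin.
    return np.max(q_values + margins) - q_values[expert_action]

# Expert chose action 1, but action 2 has a slightly higher Q-value: the loss pushes Q(s, a_E) up.
print(dqfd_margin_loss(np.array([0.1, 0.4, 0.5]), expert_action=1))  # 0.6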
def create_tf_operations(self, config):
    super(NAFModel, self).create_tf_operations(config)

    # Get hidden layers from network generator, then add NAF outputs, same for target network
    with tf.variable_scope('training'):
        self.training_network = NeuralNetwork(config.network, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs'):
            num_actions = len(self.action)

            # Action outputs
            mean = layers['linear'](x=self.training_network.output, size=num_actions)
            for n, action in enumerate(sorted(self.action)):
                # mean = tf.Print(mean, [mean])
                self.action_taken[action] = mean[n]

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output, size=lower_triangular_size)
            l_matrix = tf.exp(tf.map_fn(tf.diag, l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, 0, -1), 1):
                    column = tf.pad(l_entries[:, offset:offset + size], ((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(l_columns, 1)

            # P = LL^T
            p_matrix = tf.matmul(l_matrix, tf.transpose(l_matrix, (0, 2, 1)))
            # p_matrix = tf.Print(p_matrix, [p_matrix])

            # l_rows = []
            # offset = 0
            # for i in xrange(num_actions):
            #     # Diagonal elements are exponentiated, otherwise gradient often 0
            #     # Slice out lower triangular entries from flat representation through moving offset
            #     diagonal = tf.exp(l_entries[:, offset])  # tf.slice(l_entries, (0, offset), (-1, 1))
            #     n = config.actions - i - 1
            #     # Slice out non-zero non-diagonal entries, - 1 because we already took the diagonal
            #     non_diagonal = l_entries[:, offset + 1: offset + n + 1]  # tf.slice(l_entries, (0, offset + 1), (-1, n))
            #     # Fill up row with zeros
            #     row = tf.pad(tf.concat(axis=1, values=(diagonal, non_diagonal)), ((0, 0), (i, 0)))
            #     offset += (num_actions - i)
            #     l_rows.append(row)
            #
            # # Stack rows to matrix
            # l_matrix = tf.transpose(tf.stack(l_rows, axis=1), (0, 2, 1))

            actions = tf.stack(values=[self.action[name] for name in sorted(self.action)], axis=1)
            action_diff = actions - mean

            # A = -0.5 * (a - mean)^T P (a - mean)
            advantage = -tf.matmul(tf.expand_dims(action_diff, 1), tf.matmul(p_matrix, tf.expand_dims(action_diff, 2))) / 2
            advantage = tf.squeeze(advantage, 2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output, size=1)
            q_value = tf.squeeze(value + advantage, 1)

            training_output_vars = get_variables('training_outputs')

    with tf.variable_scope('target'):
        self.target_network = NeuralNetwork(config.network, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)
        target_value = dict()

        with tf.variable_scope('target_outputs'):
            # State-value function
            target_value_output = layers['linear'](x=self.target_network.output, size=1)
            for action in self.action:
                # NAF directly outputs V(s)
                target_value[action] = target_value_output

            target_output_vars = get_variables('target_outputs')

    with tf.name_scope("update"):
        for action in self.action:
            q_target = self.reward[:-1] + (1.0 - tf.cast(self.terminal[:-1], tf.float32)) * config.discount \
                * target_value[action][1:]
            delta = q_target - q_value[:-1]

            # We observe issues with numerical stability in some tests, gradient clipping can help
            if config.clip_gradients > 0.0:
                huber_loss = tf.where(tf.abs(delta) < config.clip_gradients, tf.multiply(tf.square(delta), 0.5), tf.abs(delta) - 0.5)
                loss = tf.reduce_mean(huber_loss)
            else:
                loss = tf.reduce_mean(tf.square(delta))
            # loss = tf.Print(loss, [loss])
            tf.losses.add_loss(loss)

    with tf.name_scope("update_target"):
        # Combine hidden layer variables and output layer variables
        training_vars = self.training_network.variables + training_output_vars
        target_vars = self.target_network.variables + target_output_vars

        self.target_network_update = list()
        for v_source, v_target in zip(training_vars, target_vars):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
def create_tf_operations(self, config):
    super(QModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(dtype=util.tf_dtype(state.type), shape=(None,) + tuple(state.shape), name=name)

    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)
        self.q_values = self.create_training_operations(config)
        self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)
        self.target_values = self.create_target_operations(config)
        self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        deltas = list()
        terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)

        for name, action in self.action.items():
            reward = self.reward
            terminal = terminal_float
            for _ in range(len(config.actions[name].shape)):
                reward = tf.expand_dims(input=reward, axis=1)
                terminal = tf.expand_dims(input=terminal, axis=1)

            q_target = reward + (1.0 - terminal) * config.discount * self.target_values[name]
            delta = tf.stop_gradient(q_target) - self.q_values[name]
            delta = tf.reshape(tensor=delta, shape=(-1, util.prod(config.actions[name].shape)))
            deltas.append(delta)

        # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
        delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        self.loss_per_instance = tf.square(delta)

        # If loss clipping is used, calculate the Huber loss
        if config.clip_loss > 0.0:
            huber_loss = tf.where(
                condition=(tf.abs(delta) < config.clip_loss),
                x=(0.5 * self.loss_per_instance),
                y=(tf.abs(delta) - 0.5))
            self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(self.q_loss)

    # Update target network
    with tf.name_scope('update-target'):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)