def test_summarizer(self):
    # Bulk Tests
    with tf.Graph().as_default():
        x = tf.placeholder("float", [None, 4])
        W = tf.Variable(tf.random_normal([4, 4]))
        x = tf.nn.tanh(tf.matmul(x, W))
        tf.add_to_collection(tf.GraphKeys.ACTIVATIONS, x)

        import tflearn.helpers.summarizer as s
        s.summarize_variables([W])
        s.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS))
        s.summarize(x, 'histogram', "test_summary")
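# The helpers above only build summary ops; nothing is written until the merged op is
# evaluated in a session. Below is a minimal, illustrative sketch (assuming TF 1.x and
# that s.summarize returns the merged summary op, which is how the snippets in this file
# use it) of evaluating such a summary and writing it with tf.summary.FileWriter.
def write_summaries_example(logdir='/tmp/summarizer-demo'):
    import numpy as np
    import tensorflow as tf
    import tflearn.helpers.summarizer as s

    with tf.Graph().as_default():
        x = tf.placeholder(tf.float32, [None, 4])
        W = tf.Variable(tf.random_normal([4, 4]))
        activation = tf.nn.tanh(tf.matmul(x, W))
        merged = s.summarize(activation, 'histogram', 'activation-histogram')

        writer = tf.summary.FileWriter(logdir)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            summary = sess.run(merged, feed_dict={x: np.random.rand(8, 4)})
            writer.add_summary(summary, global_step=0)
        writer.close()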
def create_network_graph(self):
    input_shape = self._input_shape
    output_num = self._output_num

    # Input placeholders
    with tf.name_scope('input'):
        # we need to fix the input shape from (batch, filter, height, width) to
        # tensorflow which is (batch, height, width, filter)
        x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
        x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')

    with tf.variable_scope('network'):
        actor_output, critic_output = self._network_generator(x_input, output_num)
        # flatten the critic_output NOTE: THIS IS VERY IMPORTANT
        # otherwise critic_output will be (batch_size, 1) and all ops with it and x_rewards will create a
        # tensor of shape (batch_size, batch_size)
        critic_output = tf.reshape(critic_output, [-1])

        # # summarize a histogram of each action output
        # for output_ind in range(output_num):
        #     summarizer.summarize(actor_output[:, output_ind], 'histogram', 'network-actor-output/{0}'.format(output_ind))

        # # summarize critic output
        # summarizer.summarize(tf.reduce_mean(critic_output), 'scalar', 'network-critic-output')

        # get the trainable variables for this network, later used to overwrite target network vars
        network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # # summarize activations
        # summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

        # # add network summaries
        # summarizer.summarize_variables(train_vars=network_trainables)

    # calculate losses
    with tf.name_scope('loss'):
        with tf.name_scope('critic-reward-diff'):
            critic_diff = tf.subtract(critic_output, x_rewards)

        with tf.name_scope('log-of-actor-policy'):
            # Because of https://github.com/tensorflow/tensorflow/issues/206
            # we cannot use numpy like indexing so we convert to a one hot
            # multiply then take the max over last dim
            # NumPy/Theano: est_rew = network_output[:, x_actions]
            x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                           on_value=1.0, off_value=0.0, dtype=tf.float32)
            # we reduce sum here because the output could be negative so we can't take the max;
            # the other indices will be 0
            log_policy = tf.log(actor_output + 1e-6)
            log_policy_one_hot = tf.multiply(log_policy, x_actions_one_hot)
            log_policy_action = tf.reduce_sum(log_policy_one_hot, axis=1)

        with tf.name_scope('actor-entropy'):
            actor_entropy = tf.reduce_sum(tf.multiply(actor_output, log_policy))
            summarizer.summarize(actor_entropy, 'scalar', 'actor-entropy')

        with tf.name_scope('actor-loss'):
            actor_loss = tf.reduce_sum(tf.multiply(log_policy_action, tf.stop_gradient(critic_diff)))
            summarizer.summarize(actor_loss, 'scalar', 'actor-loss')

        with tf.name_scope('critic-loss'):
            critic_loss = tf.nn.l2_loss(critic_diff) * 0.5
            summarizer.summarize(critic_loss, 'scalar', 'critic-loss')

        with tf.name_scope('total-loss'):
            # NOTICE: we are summing gradients
            # NOTE: we are maximizing entropy
            # We want the network to not be sure of its actions (entropy is highest with outputs not at 0 or 1)
            # https://www.wolframalpha.com/input/?i=log(x)+*+x
            total_loss = tf.reduce_sum(critic_loss + actor_loss + (actor_entropy * self._entropy_regularization))
            summarizer.summarize(total_loss, 'scalar', 'total-loss')

    # optimizer
    with tf.name_scope('shared-optimizer'):
        tf_learning_rate = tf.placeholder(tf.float32)
        optimizer = self._optimizer_fn(learning_rate=tf_learning_rate)
        # only train the network vars
        with tf.name_scope('compute-clip-grads'):
            gradients = optimizer.compute_gradients(total_loss)
            # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
            # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
            # so we unzip then zip
            tensors = [tensor for gradient, tensor in gradients]
            grads = [gradient for gradient, tensor in gradients]
            clipped_gradients, _ = tf.clip_by_global_norm(grads, self.global_norm_clipping)  # returns list[tensors], norm
            clipped_grads_tensors = list(zip(clipped_gradients, tensors))
            tf_train_step = optimizer.apply_gradients(clipped_grads_tensors)
            # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
            # summarizer.summarize_gradients(clipped_grads_tensors)

        # tf learn auto merges all summaries so we just have to grab the last one
        tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

    # function to get network output
    def get_output(sess, state):
        feed_dict = {x_input_channel_firstdim: state}
        actor_out_values = sess.run(actor_output, feed_dict=feed_dict)[0]
        if self.deterministic:
            return np.argmax(actor_out_values)
        else:
            return get_action_from_probabilities(actor_out_values)

    # function to train network
    def train_step(sess, states, actions, rewards, states_tp1, terminals, global_step=0, summaries=False):
        self.anneal_learning_rate(global_step)

        # nstep calculate TD reward
        if sum(terminals) > 1:
            raise ValueError('TD reward for multiple terminal states in a batch is undefined')

        # last state not terminal, need to query the value (critic) output
        curr_reward = 0
        if not terminals[-1]:
            # make a list to add back the first dim (needs to be 4 dims)
            target_feed_dict = {x_input_channel_firstdim: [states_tp1[-1]]}
            # get bootstrap estimate of last state_tp1
            curr_reward = max(sess.run(critic_output, feed_dict=target_feed_dict))

        td_rewards = []
        for reward in reversed(rewards):
            curr_reward = reward + self._q_discount * curr_reward
            td_rewards.append(curr_reward)
        # td rewards is computed backward but other lists are stored forward so need to reverse
        td_rewards = list(reversed(td_rewards))

        feed_dict = {x_input_channel_firstdim: states, x_actions: actions,
                     x_rewards: td_rewards, tf_learning_rate: self.current_learning_rate}

        if summaries:
            return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
        else:
            return sess.run([tf_train_step], feed_dict=feed_dict)

    self._get_output = get_output
    self._train_step = train_step
    self._save_variables = network_trainables
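# The TD-reward loop in train_step above builds bootstrapped n-step returns:
# R_t = r_t + gamma * R_{t+1}, seeded with the critic's estimate of the last state
# when the rollout did not end in a terminal. A standalone sketch of that computation
# (plain Python, illustrative names only, not part of the class above):
def n_step_returns(rewards, bootstrap_value, terminal, discount=0.99):
    """Discounted n-step returns for one rollout, newest reward last."""
    curr_reward = 0.0 if terminal else bootstrap_value
    returns = []
    for reward in reversed(rewards):
        curr_reward = reward + discount * curr_reward
        returns.append(curr_reward)
    # computed backward, so flip to match the forward-ordered states/actions
    return list(reversed(returns))

# Example: n_step_returns([0.0, 0.0, 1.0], bootstrap_value=0.0, terminal=True, discount=0.9)
# returns [0.81, 0.9, 1.0].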
def create_network_graph(self):
    input_shape = self._input_shape
    output_num = self._output_num

    # Input placeholders
    with tf.name_scope('input'):
        # we need to fix the input shape from (batch, filter, height, width) to
        # tensorflow which is (batch, height, width, filter)
        self._t_x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        self._t_x_input = tf.cast(tf.transpose(self._t_x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        self._t_x_input_tp1_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input-tp1')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        self._t_x_input_tp1 = tf.cast(tf.transpose(self._t_x_input_tp1_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        self._t_x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
        self._t_x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')
        self._t_x_terminals = tf.placeholder(tf.bool, shape=[None], name='x-terminals')
        self._t_x_discount = self._q_discount

    # Target network does not reuse variables
    with tf.variable_scope('network') as var_scope:
        self._t_network_output = self._network_generator(self._t_x_input, output_num)
        # get the trainable variables for this network, later used to overwrite target network vars
        self._tf_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # summarize activations
        summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

        # if double DQN then we need to create network output for s_tp1
        if self.algorithm_type == 'double' or self.algorithm_type == 'doublenstep':
            var_scope.reuse_variables()
            self._t_network_output_tp1 = self._network_generator(self._t_x_input_tp1, output_num)

        # summarize a histogram of each action output
        for output_ind in range(output_num):
            summarizer.summarize(self._t_network_output[:, output_ind], 'histogram',
                                 'network-output/{0}'.format(output_ind))

        # add network summaries
        summarizer.summarize_variables(train_vars=self._tf_network_trainables)

    with tf.variable_scope('target-network'):
        self._t_target_network_output = self._network_generator(self._t_x_input_tp1, output_num)
        # get trainables for target network, used in assign op for the update target network step
        target_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

    # update target network with network variables
    with tf.name_scope('update-target-network'):
        self._tf_update_target_network_ops = [target_v.assign(v) for v, target_v in
                                              zip(self._tf_network_trainables, target_network_trainables)]

    # if double, convenience function to get target values for online action
    if self.algorithm_type == 'double' or self.algorithm_type == 'doublenstep':
        with tf.name_scope('double_target'):
            # Target = target_Q(s_tp1, argmax(online_Q(s_tp1)))
            argmax_tp1 = tf.argmax(self._t_network_output_tp1, axis=1)
            self._t_target_value_online_action = tf_util.one_hot(self._t_target_network_output, argmax_tp1, output_num)

    # calculate QLoss
    with tf.name_scope('loss'):
        # nstep rewards are calculated outside the gpu/graph because it requires a loop
        if self.algorithm_type != 'nstep' and self.algorithm_type != 'doublenstep':
            with tf.name_scope('estimated-reward-tp1'):
                if self.algorithm_type == 'double':
                    # Target = target_Q(s_tp1, argmax(online_Q(s_tp1)))
                    target = self._t_target_value_online_action
                elif self.algorithm_type == 'dqn':
                    # Target = max(target_Q(s_tp1))
                    target = tf.reduce_max(self._t_target_network_output, axis=1)

                # compute a mask that returns gamma (discount factor) or 0 if terminal
                terminal_discount_mask = tf.multiply(1.0 - tf.cast(self._t_x_terminals, tf.float32), self._t_x_discount)
                est_rew_tp1 = tf.multiply(terminal_discount_mask, target)

            y = self._t_x_rewards + tf.stop_gradient(est_rew_tp1)
        # else nstep
        else:
            y = self._t_x_rewards

        with tf.name_scope('estimated-reward'):
            est_rew = tf_util.one_hot(self._t_network_output, self._t_x_actions, output_num)

        with tf.name_scope('qloss'):
            # clip loss but keep linear past clip bounds (huber loss with customizable linear part)
            # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
            #       https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
            diff = y - est_rew

            if self._loss_clipping > 0.0:
                abs_diff = tf.abs(diff)
                # same as min(diff, loss_clipping) because diff can never be negative (definition of abs value)
                quadratic_part = tf.clip_by_value(abs_diff, 0.0, self._loss_clipping)
                linear_part = abs_diff - quadratic_part
                loss = (0.5 * tf.square(quadratic_part)) + (self._loss_clipping * linear_part)
            else:
                # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                loss = 0.5 * tf.square(diff)
            # NOTICE: we are summing gradients
            error = tf.reduce_sum(loss)
        summarizer.summarize(error, 'scalar', 'loss')

    # optimizer
    with tf.name_scope('shared-optimizer'):
        self._tf_learning_rate = tf.placeholder(tf.float32)
        optimizer = self._optimizer_fn(learning_rate=self._tf_learning_rate)
        # only train the network vars not the target network
        gradients = optimizer.compute_gradients(error, var_list=self._tf_network_trainables)
        # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
        # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
        # so we unzip then zip
        tensors = [tensor for gradient, tensor in gradients]
        grads = [gradient for gradient, tensor in gradients]
        clipped_gradients, _ = tf.clip_by_global_norm(grads, self.global_norm_clipping)  # returns list[tensors], norm
        clipped_grads_tensors = list(zip(clipped_gradients, tensors))
        self._tf_train_step = optimizer.apply_gradients(clipped_grads_tensors)
        # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
        summarizer.summarize_gradients(clipped_grads_tensors)

        # tf learn auto merges all summaries so we just have to grab the last output
        self._tf_summaries = summarizer.summarize(self._tf_learning_rate, 'scalar', 'learning-rate')
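# tf_util.one_hot is not shown in these snippets; from its use above (selecting the
# Q-value of a given action per row), it presumably matches the inline one-hot
# multiply/reduce_sum pattern used elsewhere in this file. A hedged sketch of that
# assumed helper:
def one_hot(q_values, actions, depth):
    """Select q_values[i, actions[i]] for each row i via a one-hot mask (assumed behaviour)."""
    actions_one_hot = tf.one_hot(actions, depth=depth, on_value=1.0, off_value=0.0, dtype=tf.float32)
    # reduce_sum rather than reduce_max because q_values can be negative; all other
    # indices are zeroed out by the mask
    return tf.reduce_sum(tf.multiply(q_values, actions_one_hot), axis=1)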
def create_network_graph(self):
    input_shape = self._input_shape
    output_num = self._output_num

    # Input placeholders
    with tf.name_scope('input'):
        # we need to fix the input shape from (batch, filter, height, width) to
        # tensorflow which is (batch, height, width, filter)
        x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
        x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')

    with tf.variable_scope('network'):
        actor_output, critic_output, initial_lstm_state, new_lstm_state = self._network_generator(x_input, output_num)
        # flatten the critic_output NOTE: THIS IS VERY IMPORTANT
        # otherwise critic_output will be (batch_size, 1) and all ops with it and x_rewards will create a
        # tensor of shape (batch_size, batch_size)
        critic_output = tf.reshape(critic_output, [-1])

        # # summarize a histogram of each action output
        # for output_ind in range(output_num):
        #     summarizer.summarize(actor_output[:, output_ind], 'histogram', 'network-actor-output/{0}'.format(output_ind))

        # # summarize critic output
        # summarizer.summarize(tf.reduce_mean(critic_output), 'scalar', 'network-critic-output')

        # get the trainable variables for this network, later used to overwrite target network vars
        network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # # summarize activations
        # summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

        # # add network summaries
        # summarizer.summarize_variables(train_vars=network_trainables)

    # calculate losses
    with tf.name_scope('loss'):
        with tf.name_scope('critic-reward-diff'):
            critic_diff = tf.subtract(critic_output, x_rewards)

        with tf.name_scope('log-of-actor-policy'):
            # Because of https://github.com/tensorflow/tensorflow/issues/206
            # we cannot use numpy like indexing so we convert to a one hot
            # multiply then take the max over last dim
            # NumPy/Theano: est_rew = network_output[:, x_actions]
            x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                           on_value=1.0, off_value=0.0, dtype=tf.float32)
            # we reduce sum here because the output could be negative so we can't take the max;
            # the other indices will be 0
            log_policy = tf.log(actor_output + 1e-6)
            log_policy_one_hot = tf.multiply(log_policy, x_actions_one_hot)
            log_policy_action = tf.reduce_sum(log_policy_one_hot, axis=1)

        with tf.name_scope('actor-entropy'):
            actor_entropy = tf.reduce_sum(tf.multiply(actor_output, log_policy))
            summarizer.summarize(actor_entropy, 'scalar', 'actor-entropy')

        with tf.name_scope('actor-loss'):
            actor_loss = tf.reduce_sum(tf.multiply(log_policy_action, tf.stop_gradient(critic_diff)))
            summarizer.summarize(actor_loss, 'scalar', 'actor-loss')

        with tf.name_scope('critic-loss'):
            critic_loss = tf.nn.l2_loss(critic_diff) * 0.5
            summarizer.summarize(critic_loss, 'scalar', 'critic-loss')

        with tf.name_scope('total-loss'):
            # NOTICE: we are summing gradients
            # NOTE: we are maximizing entropy
            # We want the network to not be sure of its actions (entropy is highest with outputs not at 0 or 1)
            # https://www.wolframalpha.com/input/?i=log(x)+*+x
            total_loss = tf.reduce_sum(critic_loss + actor_loss + (actor_entropy * self._entropy_regularization))
            summarizer.summarize(total_loss, 'scalar', 'total-loss')

    # optimizer
    with tf.name_scope('shared-optimizer'):
        tf_learning_rate = tf.placeholder(tf.float32)
        optimizer = self._optimizer_fn(learning_rate=tf_learning_rate)
        # only train the network vars
        with tf.name_scope('compute-clip-grads'):
            gradients = optimizer.compute_gradients(total_loss)
            # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
            # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
            # so we unzip then zip
            tensors = [tensor for gradient, tensor in gradients]
            grads = [gradient for gradient, tensor in gradients]
            clipped_gradients, _ = tf.clip_by_global_norm(grads, self.global_norm_clipping)  # returns list[tensors], norm
            clipped_grads_tensors = list(zip(clipped_gradients, tensors))
            tf_train_step = optimizer.apply_gradients(clipped_grads_tensors)
            # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
            # summarizer.summarize_gradients(clipped_grads_tensors)

        # tf learn auto merges all summaries so we just have to grab the last one
        tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

    # function to get network output
    def get_output(sess, state):
        feed_dict = {x_input_channel_firstdim: state, initial_lstm_state: self.prev_lstm_state}
        output, lstm_state = sess.run([actor_output, new_lstm_state], feed_dict=feed_dict)
        self.prev_lstm_state = lstm_state
        return get_action_from_probabilities(output[0])

    # function to train network
    def train_step(sess, states, actions, rewards, states_tp1, terminals, lstm_state, global_step=0, summaries=False):
        self.anneal_learning_rate(global_step)

        # nstep calculate TD reward
        if sum(terminals) > 1:
            raise ValueError('TD reward for multiple terminal states in a batch is undefined')

        # last state not terminal, need to query the value (critic) output
        curr_reward = 0
        if not terminals[-1]:
            # make a list to add back the first dim (needs to be 4 dims)
            target_feed_dict = {x_input_channel_firstdim: [states_tp1[-1]], initial_lstm_state: lstm_state}
            # get bootstrap estimate of last state_tp1
            curr_reward = max(sess.run(critic_output, feed_dict=target_feed_dict))

        td_rewards = []
        for reward in reversed(rewards):
            curr_reward = reward + self._q_discount * curr_reward
            td_rewards.append(curr_reward)
        # td rewards is computed backward but other lists are stored forward so need to reverse
        td_rewards = list(reversed(td_rewards))

        feed_dict = {x_input_channel_firstdim: states, x_actions: actions, x_rewards: td_rewards,
                     tf_learning_rate: self.current_learning_rate, initial_lstm_state: lstm_state}

        if summaries:
            return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
        else:
            return sess.run([tf_train_step], feed_dict=feed_dict)

    def reset_lstm_state(new_state=None):
        if new_state is not None:
            self.prev_lstm_state = new_state
        else:
            self.prev_lstm_state = (np.zeros((1, 256)), np.zeros((1, 256)))

    self._get_output = get_output
    self._train_step = train_step
    self._save_variables = network_trainables
    self.reset_lstm_state = reset_lstm_state
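# get_action_from_probabilities is referenced but not defined in these snippets; it
# presumably samples an action index from the actor's softmax output. A minimal NumPy
# sketch of that assumed behaviour:
def get_action_from_probabilities(probabilities):
    """Sample an action index in proportion to the policy output (assumed behaviour)."""
    probabilities = np.asarray(probabilities, dtype=np.float64)
    probabilities /= probabilities.sum()  # guard against small numerical drift
    return np.random.choice(len(probabilities), p=probabilities)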
def create_network_graph(self, input_shape, output_num, network_generator, q_discount, optimizer, loss_clipping):
    # Input placeholders
    with tf.name_scope('input'):
        # we need to fix the input shape from (batch, filter, height, width) to
        # tensorflow which is (batch, height, width, filter)
        x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        x_input_tp1_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input-tp1')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        x_input_tp1 = tf.cast(tf.transpose(x_input_tp1_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
        x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')
        x_terminals = tf.placeholder(tf.bool, shape=[None], name='x-terminals')
        x_discount = q_discount

    # Target network does not reuse variables, so we use two different variable scopes
    with tf.variable_scope('network'):
        network_output = network_generator(x_input, output_num)

        # summarize a histogram of each action output
        for output_ind in range(output_num):
            summarizer.summarize(network_output[:, output_ind], 'histogram', 'network-output/{0}'.format(output_ind))

        # get the trainable variables for this network, later used to overwrite target network vars
        network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # summarize activations
        summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

        # add network summaries
        summarizer.summarize_variables(train_vars=network_trainables)

    with tf.variable_scope('target-network'):
        target_network_output = network_generator(x_input_tp1, output_num)

        # get trainables for target network, used in assign op for the update target network step
        target_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

        # summarize activations
        summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='target-network'))

        # add network summaries
        summarizer.summarize_variables(train_vars=target_network_trainables)

    # update target network with network variables
    with tf.name_scope('update-target-network'):
        update_target_network_ops = [target_v.assign(v) for v, target_v in
                                     zip(network_trainables, target_network_trainables)]

    # calculate QLoss
    with tf.name_scope('loss'):
        with tf.name_scope('estimated-reward-tp1'):
            one_minus_term = tf.multiply(1.0 - tf.cast(x_terminals, tf.float32), x_discount)
            est_rew_tp1 = tf.multiply(one_minus_term, tf.reduce_max(target_network_output, axis=1))
            y = x_rewards + tf.stop_gradient(est_rew_tp1)

        with tf.name_scope('estimated-reward'):
            # Because of https://github.com/tensorflow/tensorflow/issues/206
            # we cannot use numpy like indexing so we convert to a one hot
            # multiply then take the max over last dim
            # NumPy/Theano: est_rew = network_output[:, x_actions]
            x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                           on_value=1.0, off_value=0.0, dtype=tf.float32)
            # we reduce sum here because the output could be negative so we can't take the max;
            # the other indices will be 0
            est_rew = tf.reduce_sum(tf.multiply(network_output, x_actions_one_hot), axis=1)

        with tf.name_scope('qloss'):
            # clip loss but keep linear past clip bounds
            # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
            #       https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
            diff = y - est_rew
            if loss_clipping > 0.0:
                abs_diff = tf.abs(diff)
                # same as min(diff, loss_clipping) because diff can never be negative (definition of abs value)
                quadratic_part = tf.clip_by_value(abs_diff, 0.0, loss_clipping)
                linear_part = abs_diff - quadratic_part
                loss = (0.5 * tf.square(quadratic_part)) + (loss_clipping * linear_part)
            else:
                # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                loss = 0.5 * tf.square(diff)
            # NOTICE: we are summing gradients
            error = tf.reduce_sum(loss)
        summarizer.summarize(error, 'scalar', 'loss')

    # optimizer
    with tf.name_scope('shared-optimizer'):
        tf_learning_rate = tf.placeholder(tf.float32)
        optimizer = optimizer(learning_rate=tf_learning_rate)
        # only train the network vars not the target network
        tf_train_step = optimizer.minimize(error, var_list=network_trainables)
        # tf learn auto merges all summaries so we just have to grab the last output
        tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

    # function to get network output
    def get_output(sess, state):
        feed_dict = {x_input_channel_firstdim: state}
        return sess.run([network_output], feed_dict=feed_dict)

    # function to get mse feed dict
    def train_step(sess, current_learning_rate, state, action, reward, state_tp1, terminal, summaries=False):
        feed_dict = {x_input_channel_firstdim: state, x_input_tp1_channel_firstdim: state_tp1,
                     x_actions: action, x_rewards: reward, x_terminals: terminal,
                     tf_learning_rate: current_learning_rate}
        if summaries:
            return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
        else:
            return sess.run([tf_train_step], feed_dict=feed_dict)

    def update_target_net(sess):
        return sess.run([update_target_network_ops])

    self._get_output = get_output
    self._train_step = train_step
    self._update_target_network = update_target_net
    self.saver = tf.train.Saver(var_list=network_trainables)
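# A hedged sketch of how the functions wired up above are typically driven: act with
# _get_output, train on sampled transitions with _train_step, and periodically copy the
# online weights into the target network. agent, env, and replay are hypothetical
# stand-ins for objects this codebase would supply; the call signatures match the
# closures defined above.
def dqn_loop_sketch(sess, agent, env, replay, steps=10000, batch_size=32,
                    target_update_freq=1000, learning_rate=1e-4):
    state = env.reset()
    for step in range(steps):
        q_values = agent._get_output(sess, [state])[0]   # batch of one state
        action = int(np.argmax(q_values))
        state_tp1, reward, terminal = env.step(action)
        replay.add(state, action, reward, state_tp1, terminal)

        states, actions, rewards, states_tp1, terminals = replay.sample(batch_size)
        agent._train_step(sess, learning_rate, states, actions, rewards, states_tp1, terminals)

        if step % target_update_freq == 0:
            agent._update_target_network(sess)           # run the assign ops

        state = env.reset() if terminal else state_tp1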
def create_network_graph(self, input_shape, output_num, network_generator, q_discount, optimizer, loss_clipping):
    # Input placeholders
    with tf.name_scope('input'):
        # we need to fix the input shape from (batch, filter, height, width) to
        # tensorflow which is (batch, height, width, filter)
        x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        x_input_tp1_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input-tp1')
        # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
        x_input_tp1 = tf.cast(tf.transpose(x_input_tp1_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
        x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
        # TODO: SARSA only change
        x_actions_tp1 = tf.placeholder(tf.int32, shape=[None], name='x-actions-tp1')
        x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')
        x_terminals = tf.placeholder(tf.bool, shape=[None], name='x-terminals')
        x_discount = q_discount

    # Target network does not reuse variables, so we use two different variable scopes
    with tf.variable_scope('network'):
        network_output = network_generator(x_input, output_num)

        # summarize a histogram of each action output
        for output_ind in range(output_num):
            summarizer.summarize(network_output[:, output_ind], 'histogram', 'network-output/{0}'.format(output_ind))

        # get the trainable variables for this network, later used to overwrite target network vars
        network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

        # summarize activations
        summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

        # add network summaries
        summarizer.summarize_variables(train_vars=network_trainables)

    with tf.variable_scope('target-network'):
        target_network_output = network_generator(x_input_tp1, output_num)

        # get trainables for target network, used in assign op for the update target network step
        target_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

        # summarize activations
        summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='target-network'))

        # add network summaries
        summarizer.summarize_variables(train_vars=target_network_trainables)

    # update target network with network variables
    with tf.name_scope('update-target-network'):
        update_target_network_ops = [target_v.assign(v) for v, target_v in
                                     zip(network_trainables, target_network_trainables)]

    # calculate QLoss
    with tf.name_scope('loss'):
        with tf.name_scope('estimated-reward-tp1'):
            one_minus_term = tf.multiply(1.0 - tf.cast(x_terminals, tf.float32), x_discount)
            # TODO: SARSA only change
            # SARSA uses the q estimate of the next state given action_tp1, not the max.
            # We must convert to one hot, same as below
            # NumPy/Theano: est_rew_tp1 = network_output[:, x_actions_tp1]
            x_actions_tp1_one_hot = tf.one_hot(x_actions_tp1, depth=output_num, name='one-hot-tp1',
                                               on_value=1.0, off_value=0.0, dtype=tf.float32)
            # we reduce sum here because the output could be negative so we can't take the max;
            # the other indices will be 0
            network_est_rew_tp1 = tf.reduce_sum(tf.multiply(target_network_output, x_actions_tp1_one_hot), axis=1)
            est_rew_tp1 = tf.multiply(one_minus_term, network_est_rew_tp1)
            y = x_rewards + tf.stop_gradient(est_rew_tp1)

        with tf.name_scope('estimated-reward'):
            # Because of https://github.com/tensorflow/tensorflow/issues/206
            # we cannot use numpy like indexing so we convert to a one hot
            # multiply then take the max over last dim
            # NumPy/Theano: est_rew = network_output[:, x_actions]
            x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                           on_value=1.0, off_value=0.0, dtype=tf.float32)
            # we reduce sum here because the output could be negative so we can't take the max;
            # the other indices will be 0
            est_rew = tf.reduce_sum(tf.multiply(network_output, x_actions_one_hot), axis=1)

        with tf.name_scope('qloss'):
            # clip loss but keep linear past clip bounds
            # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
            #       https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
            diff = y - est_rew
            if loss_clipping > 0.0:
                abs_diff = tf.abs(diff)
                # same as min(diff, loss_clipping) because diff can never be negative (definition of abs value)
                quadratic_part = tf.clip_by_value(abs_diff, 0.0, loss_clipping)
                linear_part = abs_diff - quadratic_part
                loss = (0.5 * tf.square(quadratic_part)) + (loss_clipping * linear_part)
            else:
                # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                loss = 0.5 * tf.square(diff)
            # NOTICE: we are summing gradients
            error = tf.reduce_sum(loss)
        summarizer.summarize(error, 'scalar', 'loss')

    # optimizer
    with tf.name_scope('shared-optimizer'):
        tf_learning_rate = tf.placeholder(tf.float32)
        optimizer = optimizer(learning_rate=tf_learning_rate)
        # only train the network vars not the target network
        tf_train_step = optimizer.minimize(error, var_list=network_trainables)
        # tf learn auto merges all summaries so we just have to grab the last output
        tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

    # function to get network output
    def get_output(sess, state):
        feed_dict = {x_input_channel_firstdim: state}
        return sess.run([network_output], feed_dict=feed_dict)

    # function to get mse feed dict
    def train_step(sess, current_learning_rate, state, action, reward, state_tp1, action_tp1, terminal, summaries=False):
        feed_dict = {x_input_channel_firstdim: state, x_input_tp1_channel_firstdim: state_tp1,
                     x_actions: action,
                     # TODO: SARSA only change action_tp1
                     x_actions_tp1: action_tp1,
                     x_rewards: reward, x_terminals: terminal,
                     tf_learning_rate: current_learning_rate}
        if summaries:
            return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
        else:
            return sess.run([tf_train_step], feed_dict=feed_dict)

    def update_target_net(sess):
        return sess.run([update_target_network_ops])

    self._get_output = get_output
    self._train_step = train_step
    self._update_target_network = update_target_net
    self.saver = tf.train.Saver(var_list=network_trainables)
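# The 'qloss' blocks above implement the clipped quadratic/linear loss used throughout
# this file; for a clip value delta it is the Huber loss: 0.5*d^2 when |d| <= delta and
# delta*|d| - 0.5*delta^2 otherwise. A small NumPy check of that equivalence
# (illustrative only):
def clipped_loss(diff, delta):
    abs_diff = np.abs(diff)
    quadratic_part = np.clip(abs_diff, 0.0, delta)
    linear_part = abs_diff - quadratic_part
    return 0.5 * np.square(quadratic_part) + delta * linear_part

def huber_loss(diff, delta):
    abs_diff = np.abs(diff)
    return np.where(abs_diff <= delta, 0.5 * np.square(diff), delta * abs_diff - 0.5 * delta ** 2)

# np.allclose(clipped_loss(d, 1.0), huber_loss(d, 1.0)) holds, e.g. for
# d = np.array([-3.0, -0.5, 0.0, 0.4, 2.5]).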