Example #1
 def test_summarizer(self):
     # Bulk Tests
     with tf.Graph().as_default():
         x = tf.placeholder("float", [None, 4])
         W = tf.Variable(tf.random_normal([4, 4]))
         x = tf.nn.tanh(tf.matmul(x, W))
         tf.add_to_collection(tf.GraphKeys.ACTIVATIONS, x)
         import tflearn.helpers.summarizer as s
         s.summarize_variables([W])
         s.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS))
         s.summarize(x, 'histogram', "test_summary")
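The helpers above only build summary ops; nothing is written until the merged op returned by s.summarize is evaluated in a session. Below is a minimal sketch (not part of the original test; TF 1.x assumed, and the log directory '/tmp/summarizer_demo' is arbitrary) of how that op might be run and written with a tf.summary.FileWriter:

    import numpy as np
    import tensorflow as tf
    import tflearn.helpers.summarizer as s

    with tf.Graph().as_default():
        x_in = tf.placeholder("float", [None, 4])
        W = tf.Variable(tf.random_normal([4, 4]))
        act = tf.nn.tanh(tf.matmul(x_in, W))
        # summarize() returns the merged summary op for the collection it writes to
        merged = s.summarize(act, 'histogram', "demo_summary")

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter('/tmp/summarizer_demo', sess.graph)
            summary_str = sess.run(merged, feed_dict={x_in: np.random.rand(8, 4).astype('float32')})
            writer.add_summary(summary_str, global_step=0)
            writer.close()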
Example #3
    def create_network_graph(self):
        input_shape = self._input_shape
        output_num = self._output_num
        # Input placeholders
        with tf.name_scope('input'):
            # reorder the input from (batch, channels, height, width) to
            # TensorFlow's expected layout (batch, height, width, channels)
            x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
            x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')

        with tf.variable_scope('network'):
            actor_output, critic_output = self._network_generator(x_input, output_num)
            # flatten the critic_output NOTE: THIS IS VERY IMPORTANT
            # otherwise critic_output will be (batch_size, 1) and all ops with it and x_rewards will create a
            # tensor of shape (batch_size, batch_size)
            critic_output = tf.reshape(critic_output, [-1])

            # # summarize a histogram of each action output
            # for output_ind in range(output_num):
            #     summarizer.summarize(actor_output[:, output_ind], 'histogram', 'network-actor-output/{0}'.format(output_ind))
            # # summarize critic output
            # summarizer.summarize(tf.reduce_mean(critic_output), 'scalar', 'network-critic-output')

            # # get the trainable variables for this network, later used to overwrite target network vars
            network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

            # # summarize activations
            # summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

            # # add network summaries
            # summarizer.summarize_variables(train_vars=network_trainables)

        # calculate losses
        with tf.name_scope('loss'):
            with tf.name_scope('critic-reward-diff'):
                critic_diff = tf.subtract(critic_output, x_rewards)

            with tf.name_scope('log-of-actor-policy'):
                # Because of https://github.com/tensorflow/tensorflow/issues/206
                # we cannot use NumPy-style indexing, so we convert the actions to a one-hot,
                # multiply, then reduce-sum over the last dim
                # NumPy/Theano est_rew = network_output[:, x_actions]
                x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                               on_value=1.0, off_value=0.0, dtype=tf.float32)
                # we reduce_sum here because the output could be negative, so we cannot take the max;
                # the other indices will be 0 after the one-hot multiply
                log_policy = tf.log(actor_output + 1e-6)
                log_policy_one_hot = tf.multiply(log_policy, x_actions_one_hot)
                log_policy_action = tf.reduce_sum(log_policy_one_hot, axis=1)

            with tf.name_scope('actor-entropy'):
                actor_entropy = tf.reduce_sum(tf.multiply(actor_output, log_policy))
                summarizer.summarize(actor_entropy, 'scalar', 'actor-entropy')

            with tf.name_scope('actor-loss'):
                actor_loss = tf.reduce_sum(tf.multiply(log_policy_action, tf.stop_gradient(critic_diff)))
                summarizer.summarize(actor_loss, 'scalar', 'actor-loss')

            with tf.name_scope('critic-loss'):
                critic_loss = tf.nn.l2_loss(critic_diff) * 0.5
                summarizer.summarize(critic_loss, 'scalar', 'critic-loss')

            with tf.name_scope('total-loss'):
                # NOTICE: we are summing gradients
                # NOTE: we are maximizing entropy
                # We want the network not to be too sure of its actions (entropy is highest when outputs are not near 0 or 1)
                # https://www.wolframalpha.com/input/?i=log(x)+*+x
                total_loss = tf.reduce_sum(critic_loss + actor_loss + (actor_entropy * self._entropy_regularization))
                summarizer.summarize(total_loss, 'scalar', 'total-loss')

        # optimizer
        with tf.name_scope('shared-optimizer'):
            tf_learning_rate = tf.placeholder(tf.float32)
            optimizer = self._optimizer_fn(learning_rate=tf_learning_rate)
            # only train the network vars
            with tf.name_scope('compute-clip-grads'):
                gradients = optimizer.compute_gradients(total_loss)
                # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
                # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
                # so we unzip then zip
                tensors = [tensor for gradient, tensor in gradients]
                grads = [gradient for gradient, tensor in gradients]
                clipped_gradients, _ = tf.clip_by_global_norm(grads, self.global_norm_clipping)  # returns list[tensors], norm
                clipped_grads_tensors = zip(clipped_gradients, tensors)
                tf_train_step = optimizer.apply_gradients(clipped_grads_tensors)
                # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
                # summarizer.summarize_gradients(clipped_grads_tensors)

            # tf learn auto merges all summaries so we just have to grab the last one
            tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

        # function to get network output
        def get_output(sess, state):
            feed_dict = {x_input_channel_firstdim: state}
            actor_out_values = sess.run(actor_output, feed_dict=feed_dict)[0]
            if self.deterministic:
                return np.argmax(actor_out_values)
            else:
                return get_action_from_probabilities(actor_out_values)

        # function to train network
        def train_step(sess, states, actions, rewards, states_tp1, terminals, global_step=0, summaries=False):
            self.anneal_learning_rate(global_step)

            # nstep calculate TD reward
            if sum(terminals) > 1:
                raise ValueError('TD reward for multiple terminal states in a batch is undefined')

            # if the last state is not terminal, bootstrap from the critic's value estimate
            curr_reward = 0
            if not terminals[-1]:
                target_feed_dict = {x_input_channel_firstdim: [states_tp1[-1]]}  # make a list to add back the first dim (needs to be 4 dims)
                curr_reward = max(sess.run(critic_output, feed_dict=target_feed_dict))

            # accumulate discounted n-step returns, working backwards from the bootstrap value
            td_rewards = []
            for reward in reversed(rewards):
                curr_reward = reward + self._q_discount * curr_reward
                td_rewards.append(curr_reward)
            # td_rewards was built backwards while the other lists are ordered forwards, so reverse it
            td_rewards = list(reversed(td_rewards))
            feed_dict = {x_input_channel_firstdim: states, x_actions: actions, x_rewards: td_rewards,
                         tf_learning_rate: self.current_learning_rate}

            if summaries:
                return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
            else:
                return sess.run([tf_train_step], feed_dict=feed_dict)

        self._get_output = get_output
        self._train_step = train_step
        self._save_variables = network_trainables
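For reference, here is a standalone sketch of the discounted n-step return that train_step above builds in its loop (illustrative only; the helper name nstep_returns is not from the original code). Rewards are accumulated backwards from the critic's bootstrap value, so each entry equals r_t + gamma * R_{t+1}:

    def nstep_returns(rewards, bootstrap_value, gamma=0.99):
        # accumulate discounted returns backwards from the bootstrap value
        curr = bootstrap_value
        returns = []
        for r in reversed(rewards):
            curr = r + gamma * curr
            returns.append(curr)
        # built backwards, so reverse to line up with the forward-ordered rewards
        return list(reversed(returns))

    # Worked example with gamma=0.9 and a bootstrap value of 5.0:
    # nstep_returns([1.0, 0.0, 2.0], 5.0, gamma=0.9) -> [6.265, 5.85, 6.5]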
Example #4
    def create_network_graph(self):
        input_shape = self._input_shape
        output_num = self._output_num
        # Input placeholders
        with tf.name_scope('input'):
            # reorder the input from (batch, channels, height, width) to
            # TensorFlow's expected layout (batch, height, width, channels)
            self._t_x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            self._t_x_input = tf.cast(tf.transpose(self._t_x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            self._t_x_input_tp1_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input-tp1')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            self._t_x_input_tp1 = tf.cast(tf.transpose(self._t_x_input_tp1_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            self._t_x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
            self._t_x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')
            self._t_x_terminals = tf.placeholder(tf.bool, shape=[None], name='x-terminals')
            self._t_x_discount = self._q_discount

        # Target network does not reuse variables
        with tf.variable_scope('network') as var_scope:
            self._t_network_output = self._network_generator(self._t_x_input, output_num)

            # get the trainable variables for this network, later used to overwrite target network vars
            self._tf_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

            # summarize activations
            summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

            # if double DQN then we need to create network output for s_tp1
            if self.algorithm_type == 'double' or self.algorithm_type == 'doublenstep':
                var_scope.reuse_variables()
                self._t_network_output_tp1 = self._network_generator(self._t_x_input_tp1, output_num)

            # summarize a histogram of each action output
            for output_ind in range(output_num):
                summarizer.summarize(self._t_network_output[:, output_ind], 'histogram', 'network-output/{0}'.format(output_ind))

            # add network summaries
            summarizer.summarize_variables(train_vars=self._tf_network_trainables)

        with tf.variable_scope('target-network'):
            self._t_target_network_output = self._network_generator(self._t_x_input_tp1, output_num)

            # get trainables for target network, used in assign op for the update target network step
            target_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

        # update target network with network variables
        with tf.name_scope('update-target-network'):
            self._tf_update_target_network_ops = [target_v.assign(v) for v, target_v in zip(self._tf_network_trainables, target_network_trainables)]

        # if double DQN: convenience op to get the target network's value for the online network's action
        if self.algorithm_type == 'double' or self.algorithm_type == 'doublenstep':
            with tf.name_scope('double_target'):
                # Target = target_Q(s_tp1, argmax(online_Q(s_tp1)))
                argmax_tp1 = tf.argmax(self._t_network_output_tp1, axis=1)
                self._t_target_value_online_action = tf_util.one_hot(self._t_target_network_output, argmax_tp1, output_num)

        # calculate QLoss
        with tf.name_scope('loss'):
            # nstep rewards are calculated outside the gpu/graph because it requires a loop
            if self.algorithm_type != 'nstep' and self.algorithm_type != 'doublenstep':
                with tf.name_scope('estimated-reward-tp1'):
                    if self.algorithm_type == 'double':
                        # Target = target_Q(s_tp1, argmax(online_Q(s_tp1)))
                        target = self._t_target_value_online_action
                    elif self.algorithm_type == 'dqn':
                        # Target = max(target_Q(s_tp1))
                        target = tf.reduce_max(self._t_target_network_output, axis=1)

                    # compute a mask that returns gamma (discount factor) or 0 if terminal
                    terminal_discount_mask = tf.multiply(1.0 - tf.cast(self._t_x_terminals, tf.float32), self._t_x_discount)
                    est_rew_tp1 = tf.multiply(terminal_discount_mask, target)

                y = self._t_x_rewards + tf.stop_gradient(est_rew_tp1)
            # else nstep
            else:
                y = self._t_x_rewards

            with tf.name_scope('estimated-reward'):
                est_rew = tf_util.one_hot(self._t_network_output, self._t_x_actions, output_num)

            with tf.name_scope('qloss'):
                # clip loss but keep linear past clip bounds (huber loss with customizable linear part)
                # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
                # https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
                diff = y - est_rew

                if self._loss_clipping > 0.0:
                    abs_diff = tf.abs(diff)
                    # same as min(abs_diff, loss_clipping) because abs_diff can never be negative (definition of abs value)
                    quadratic_part = tf.clip_by_value(abs_diff, 0.0, self._loss_clipping)
                    linear_part = abs_diff - quadratic_part
                    loss = (0.5 * tf.square(quadratic_part)) + (self._loss_clipping * linear_part)
                else:
                    # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                    loss = 0.5 * tf.square(diff)
                # NOTICE: we are summing gradients
                error = tf.reduce_sum(loss)
            summarizer.summarize(error, 'scalar', 'loss')

        # optimizer
        with tf.name_scope('shared-optimizer'):
            self._tf_learning_rate = tf.placeholder(tf.float32)
            optimizer = self._optimizer_fn(learning_rate=self._tf_learning_rate)
            # only train the network vars not the target network
            gradients = optimizer.compute_gradients(error, var_list=self._tf_network_trainables)
            # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
            # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
            # so we unzip then zip
            tensors = [tensor for gradient, tensor in gradients]
            grads = [gradient for gradient, tensor in gradients]
            clipped_gradients, _ = tf.clip_by_global_norm(grads, self.global_norm_clipping)  # returns list[tensors], norm
            # materialize as a list: zip is a one-shot iterator on Python 3 and is consumed twice below
            clipped_grads_tensors = list(zip(clipped_gradients, tensors))
            self._tf_train_step = optimizer.apply_gradients(clipped_grads_tensors)
            # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
            summarizer.summarize_gradients(clipped_grads_tensors)

            # tf learn auto merges all summaries so we just have to grab the last output
            self._tf_summaries = summarizer.summarize(self._tf_learning_rate, 'scalar', 'learning-rate')
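As a rough NumPy illustration of the double-DQN target assembled above, Target = target_Q(s_tp1, argmax(online_Q(s_tp1))); the Q-values below are made up and tf_util.one_hot is approximated with a plain gather:

    import numpy as np

    # made-up Q-values for a batch of 2 states and 3 actions
    online_q_tp1 = np.array([[0.1, 0.9, 0.3],
                             [0.5, 0.2, 0.4]])
    target_q_tp1 = np.array([[1.0, 2.0, 3.0],
                             [4.0, 5.0, 6.0]])

    argmax_tp1 = online_q_tp1.argmax(axis=1)                    # online net picks the action: [1, 0]
    double_dqn_target = target_q_tp1[np.arange(2), argmax_tp1]  # target net evaluates it: [2.0, 4.0]
    dqn_target = target_q_tp1.max(axis=1)                       # plain DQN would instead use: [3.0, 6.0]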
Example #5
    def create_network_graph(self):
        input_shape = self._input_shape
        output_num = self._output_num
        # Input placeholders
        with tf.name_scope('input'):
            # reorder the input from (batch, channels, height, width) to
            # TensorFlow's expected layout (batch, height, width, channels)
            x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
            x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')

        with tf.variable_scope('network'):
            actor_output, critic_output, initial_lstm_state, new_lstm_state = self._network_generator(x_input, output_num)
            # flatten the critic_output NOTE: THIS IS VERY IMPORTANT
            # otherwise critic_output will be (batch_size, 1) and all ops with it and x_rewards will create a
            # tensor of shape (batch_size, batch_size)
            critic_output = tf.reshape(critic_output, [-1])

            # # summarize a histogram of each action output
            # for output_ind in range(output_num):
            #     summarizer.summarize(actor_output[:, output_ind], 'histogram', 'network-actor-output/{0}'.format(output_ind))
            # # summarize critic output
            # summarizer.summarize(tf.reduce_mean(critic_output), 'scalar', 'network-critic-output')

            # # get the trainable variables for this network, later used to overwrite target network vars
            network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

            # # summarize activations
            # summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

            # # add network summaries
            # summarizer.summarize_variables(train_vars=network_trainables)

        # calculate losses
        with tf.name_scope('loss'):
            with tf.name_scope('critic-reward-diff'):
                critic_diff = tf.subtract(critic_output, x_rewards)

            with tf.name_scope('log-of-actor-policy'):
                # Because of https://github.com/tensorflow/tensorflow/issues/206
                # we cannot use NumPy-style indexing, so we convert the actions to a one-hot,
                # multiply, then reduce-sum over the last dim
                # NumPy/Theano est_rew = network_output[:, x_actions]
                x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                               on_value=1.0, off_value=0.0, dtype=tf.float32)
                # we reduce_sum here because the output could be negative, so we cannot take the max;
                # the other indices will be 0 after the one-hot multiply
                log_policy = tf.log(actor_output + 1e-6)
                log_policy_one_hot = tf.multiply(log_policy, x_actions_one_hot)
                log_policy_action = tf.reduce_sum(log_policy_one_hot, axis=1)

            with tf.name_scope('actor-entropy'):
                actor_entropy = tf.reduce_sum(tf.multiply(actor_output, log_policy))
                summarizer.summarize(actor_entropy, 'scalar', 'actor-entropy')

            with tf.name_scope('actor-loss'):
                actor_loss = tf.reduce_sum(tf.multiply(log_policy_action, tf.stop_gradient(critic_diff)))
                summarizer.summarize(actor_loss, 'scalar', 'actor-loss')

            with tf.name_scope('critic-loss'):
                critic_loss = tf.nn.l2_loss(critic_diff) * 0.5
                summarizer.summarize(critic_loss, 'scalar', 'critic-loss')

            with tf.name_scope('total-loss'):
                # NOTICE: we are summing gradients
                # NOTE: we are maximizing entropy
                # We want the network not to be too sure of its actions (entropy is highest when outputs are not near 0 or 1)
                # https://www.wolframalpha.com/input/?i=log(x)+*+x
                total_loss = tf.reduce_sum(critic_loss + actor_loss + (actor_entropy * self._entropy_regularization))
                summarizer.summarize(total_loss, 'scalar', 'total-loss')

        # optimizer
        with tf.name_scope('shared-optimizer'):
            tf_learning_rate = tf.placeholder(tf.float32)
            optimizer = self._optimizer_fn(learning_rate=tf_learning_rate)
            # only train the network vars
            with tf.name_scope('compute-clip-grads'):
                gradients = optimizer.compute_gradients(total_loss)
                # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
                # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
                # so we unzip then zip
                tensors = [tensor for gradient, tensor in gradients]
                grads = [gradient for gradient, tensor in gradients]
                clipped_gradients, _ = tf.clip_by_global_norm(grads, self.global_norm_clipping)  # returns list[tensors], norm
                clipped_grads_tensors = zip(clipped_gradients, tensors)
                tf_train_step = optimizer.apply_gradients(clipped_grads_tensors)
                # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
                # summarizer.summarize_gradients(clipped_grads_tensors)

            # tf learn auto merges all summaries so we just have to grab the last one
            tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

        # function to get network output
        def get_output(sess, state):
            feed_dict = {x_input_channel_firstdim: state, initial_lstm_state: self.prev_lstm_state}
            output, lstm_state = sess.run([actor_output, new_lstm_state], feed_dict=feed_dict)
            self.prev_lstm_state = lstm_state
            return get_action_from_probabilities(output[0])

        # function to train network
        def train_step(sess, states, actions, rewards, states_tp1, terminals, lstm_state, global_step=0, summaries=False):
            self.anneal_learning_rate(global_step)

            # nstep calculate TD reward
            if sum(terminals) > 1:
                raise ValueError('TD reward for multiple terminal states in a batch is undefined')

            # if the last state is not terminal, bootstrap from the critic's value estimate
            curr_reward = 0
            if not terminals[-1]:
                # make a list to add back the first dim (needs to be 4 dims)
                target_feed_dict = {x_input_channel_firstdim: [states_tp1[-1]],
                                    initial_lstm_state: lstm_state}
                curr_reward = max(sess.run(critic_output, feed_dict=target_feed_dict))

            # accumulate discounted n-step returns, working backwards from the bootstrap value
            td_rewards = []
            for reward in reversed(rewards):
                curr_reward = reward + self._q_discount * curr_reward
                td_rewards.append(curr_reward)
            # td_rewards was built backwards while the other lists are ordered forwards, so reverse it
            td_rewards = list(reversed(td_rewards))
            feed_dict = {x_input_channel_firstdim: states, x_actions: actions, x_rewards: td_rewards,
                         tf_learning_rate: self.current_learning_rate, initial_lstm_state: lstm_state}

            if summaries:
                return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
            else:
                return sess.run([tf_train_step], feed_dict=feed_dict)

        def reset_lstm_state(new_state=None):
            if new_state is not None:
                self.prev_lstm_state = new_state
            else:
                self.prev_lstm_state = (np.zeros((1, 256)), np.zeros((1, 256)))

        self._get_output = get_output
        self._train_step = train_step
        self._save_variables = network_trainables
        self.reset_lstm_state = reset_lstm_state
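A quick NumPy check of the entropy term used in both A3C variants above (the probability vectors are made up): actor_entropy = sum(p * log p) is the negative entropy, so minimizing a total loss that includes it pushes the policy towards higher entropy, i.e. less certain actions:

    import numpy as np

    def neg_entropy(p, eps=1e-6):
        # same form as the graph above: sum(p * log(p + eps))
        return np.sum(p * np.log(p + eps))

    uniform = np.array([0.25, 0.25, 0.25, 0.25])
    peaked = np.array([0.97, 0.01, 0.01, 0.01])

    # neg_entropy(uniform) ~ -1.386  (lowest value, favored when minimizing)
    # neg_entropy(peaked)  ~ -0.168  (higher value, penalized)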
    def create_network_graph(self, input_shape, output_num, network_generator, q_discount, optimizer, loss_clipping):
        # Input placeholders
        with tf.name_scope('input'):
            # reorder the input from (batch, channels, height, width) to
            # TensorFlow's expected layout (batch, height, width, channels)
            x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            x_input_tp1_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input-tp1')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            x_input_tp1 = tf.cast(tf.transpose(x_input_tp1_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
            x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')
            x_terminals = tf.placeholder(tf.bool, shape=[None], name='x-terminals')
            x_discount = q_discount

        # The target network does not reuse variables, so we use two separate variable scopes
        with tf.variable_scope('network'):
            network_output = network_generator(x_input, output_num)

            # summarize a histogram of each action output
            for output_ind in range(output_num):
                summarizer.summarize(network_output[:, output_ind], 'histogram', 'network-output/{0}'.format(output_ind))

            # get the trainable variables for this network, later used to overwrite target network vars
            network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

            # summarize activations
            summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

            # add network summaries
            summarizer.summarize_variables(train_vars=network_trainables)

        with tf.variable_scope('target-network'):
            target_network_output = network_generator(x_input_tp1, output_num)

            # get trainables for target network, used in assign op for the update target network step
            target_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

            # summarize activations
            summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='target-network'))

            # add network summaries
            summarizer.summarize_variables(train_vars=target_network_trainables)

        # update target network with network variables
        with tf.name_scope('update-target-network'):
            update_target_network_ops = [target_v.assign(v) for v, target_v in zip(network_trainables, target_network_trainables)]

        # calculate QLoss
        with tf.name_scope('loss'):
            with tf.name_scope('estimated-reward-tp1'):
                one_minus_term = tf.multiply(1.0 - tf.cast(x_terminals, tf.float32), x_discount)
                est_rew_tp1 = tf.multiply(one_minus_term, tf.reduce_max(target_network_output, axis=1))

            y = x_rewards + tf.stop_gradient(est_rew_tp1)

            with tf.name_scope('estimated-reward'):
                # Because of https://github.com/tensorflow/tensorflow/issues/206
                # we cannot use NumPy-style indexing, so we convert the actions to a one-hot,
                # multiply, then reduce-sum over the last dim
                # NumPy/Theano est_rew = network_output[:, x_actions]
                x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                               on_value=1.0, off_value=0.0, dtype=tf.float32)
                # we reduce_sum here because the output could be negative, so we cannot take the max;
                # the other indices will be 0 after the one-hot multiply
                est_rew = tf.reduce_sum(tf.multiply(network_output, x_actions_one_hot), axis=1)

            with tf.name_scope('qloss'):
                # clip loss but keep linear past clip bounds
                # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
                # https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
                diff = y - est_rew

                if loss_clipping > 0.0:
                    abs_diff = tf.abs(diff)
                    # same as min(abs_diff, loss_clipping) because abs_diff can never be negative (definition of abs value)
                    quadratic_part = tf.clip_by_value(abs_diff, 0.0, loss_clipping)
                    linear_part = abs_diff - quadratic_part
                    loss = (0.5 * tf.square(quadratic_part)) + (loss_clipping * linear_part)
                else:
                    # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                    loss = 0.5 * tf.square(diff)
                # NOTICE: we are summing gradients
                error = tf.reduce_sum(loss)
            summarizer.summarize(error, 'scalar', 'loss')

        # optimizer
        with tf.name_scope('shared-optimizer'):
            tf_learning_rate = tf.placeholder(tf.float32)
            optimizer = optimizer(learning_rate=tf_learning_rate)
            # only train the network vars not the target network
            tf_train_step = optimizer.minimize(error, var_list=network_trainables)

            # tf learn auto merges all summaries so we just have to grab the last output
            tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

        # function to get network output
        def get_output(sess, state):
            feed_dict = {x_input_channel_firstdim: state}
            return sess.run([network_output], feed_dict=feed_dict)

        # function to run a single train step
        def train_step(sess, current_learning_rate, state, action, reward, state_tp1, terminal, summaries=False):
            feed_dict = {x_input_channel_firstdim: state, x_input_tp1_channel_firstdim: state_tp1,
                         x_actions: action, x_rewards: reward, x_terminals: terminal,
                         tf_learning_rate: current_learning_rate}
            if summaries:
                return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
            else:
                return sess.run([tf_train_step], feed_dict=feed_dict)

        def update_target_net(sess):
            return sess.run([update_target_network_ops])

        self._get_output = get_output
        self._train_step = train_step
        self._update_target_network = update_target_net
        self.saver = tf.train.Saver(var_list=network_trainables)
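The qloss block above is a Huber-style loss: quadratic inside the clip bound and linear outside it. A small standalone NumPy sketch of the same piecewise form, with illustrative values:

    import numpy as np

    def clipped_loss(diff, loss_clipping=1.0):
        abs_diff = np.abs(diff)
        quadratic_part = np.clip(abs_diff, 0.0, loss_clipping)
        linear_part = abs_diff - quadratic_part
        return 0.5 * quadratic_part ** 2 + loss_clipping * linear_part

    # clipped_loss(np.array([0.5, 2.0])) -> [0.125, 1.5]
    # inside the bound: 0.5 * 0.5**2 = 0.125; outside: 0.5 * 1.0**2 + 1.0 * (2.0 - 1.0) = 1.5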
Example #7
    def create_network_graph(self):
        input_shape = self._input_shape
        output_num = self._output_num
        # Input placeholders
        with tf.name_scope('input'):
            # reorder the input from (batch, channels, height, width) to
            # TensorFlow's expected layout (batch, height, width, channels)
            self._t_x_input_channel_firstdim = tf.placeholder(
                tf.uint8, [None] + input_shape, name='x-input')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            self._t_x_input = tf.cast(
                tf.transpose(self._t_x_input_channel_firstdim,
                             perm=[0, 2, 3, 1]), tf.float32) / 255.0
            self._t_x_input_tp1_channel_firstdim = tf.placeholder(
                tf.uint8, [None] + input_shape, name='x-input-tp1')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            self._t_x_input_tp1 = tf.cast(
                tf.transpose(self._t_x_input_tp1_channel_firstdim,
                             perm=[0, 2, 3, 1]), tf.float32) / 255.0
            self._t_x_actions = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name='x-actions')
            self._t_x_rewards = tf.placeholder(tf.float32,
                                               shape=[None],
                                               name='x-rewards')
            self._t_x_terminals = tf.placeholder(tf.bool,
                                                 shape=[None],
                                                 name='x-terminals')
            self._t_x_discount = self._q_discount

        # Target network does not reuse variables
        with tf.variable_scope('network') as var_scope:
            self._t_network_output = self._network_generator(
                self._t_x_input, output_num)

            # get the trainable variables for this network, later used to overwrite target network vars
            self._tf_network_trainables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

            # summarize activations
            summarizer.summarize_activations(
                tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

            # if double DQN then we need to create network output for s_tp1
            if self.algorithm_type == 'double' or self.algorithm_type == 'doublenstep':
                var_scope.reuse_variables()
                self._t_network_output_tp1 = self._network_generator(
                    self._t_x_input_tp1, output_num)

            # summarize a histogram of each action output
            for output_ind in range(output_num):
                summarizer.summarize(self._t_network_output[:, output_ind],
                                     'histogram',
                                     'network-output/{0}'.format(output_ind))

            # add network summaries
            summarizer.summarize_variables(
                train_vars=self._tf_network_trainables)

        with tf.variable_scope('target-network'):
            self._t_target_network_output = self._network_generator(
                self._t_x_input_tp1, output_num)

            # get trainables for target network, used in assign op for the update target network step
            target_network_trainables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

        # update target network with network variables
        with tf.name_scope('update-target-network'):
            self._tf_update_target_network_ops = [
                target_v.assign(v) for v, target_v in zip(
                    self._tf_network_trainables, target_network_trainables)
            ]

        # if double DQN: convenience op to get the target network's value for the online network's action
        if self.algorithm_type == 'double' or self.algorithm_type == 'doublenstep':
            with tf.name_scope('double_target'):
                # Target = target_Q(s_tp1, argmax(online_Q(s_tp1)))
                argmax_tp1 = tf.argmax(self._t_network_output_tp1, axis=1)
                self._t_target_value_online_action = tf_util.one_hot(
                    self._t_target_network_output, argmax_tp1, output_num)

        # calculate QLoss
        with tf.name_scope('loss'):
            # nstep rewards are calculated outside the gpu/graph because it requires a loop
            if self.algorithm_type != 'nstep' and self.algorithm_type != 'doublenstep':
                with tf.name_scope('estimated-reward-tp1'):
                    if self.algorithm_type == 'double':
                        # Target = target_Q(s_tp1, argmax(online_Q(s_tp1)))
                        target = self._t_target_value_online_action
                    elif self.algorithm_type == 'dqn':
                        # Target = max(target_Q(s_tp1))
                        target = tf.reduce_max(self._t_target_network_output,
                                               axis=1)

                    # compute a mask that returns gamma (discount factor) or 0 if terminal
                    terminal_discount_mask = tf.multiply(
                        1.0 - tf.cast(self._t_x_terminals, tf.float32),
                        self._t_x_discount)
                    est_rew_tp1 = tf.multiply(terminal_discount_mask, target)

                y = self._t_x_rewards + tf.stop_gradient(est_rew_tp1)
            # else nstep
            else:
                y = self._t_x_rewards

            with tf.name_scope('estimated-reward'):
                est_rew = tf_util.one_hot(self._t_network_output,
                                          self._t_x_actions, output_num)

            with tf.name_scope('qloss'):
                # clip loss but keep linear past clip bounds (huber loss with customizable linear part)
                # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
                # https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
                diff = y - est_rew

                if self._loss_clipping > 0.0:
                    abs_diff = tf.abs(diff)
                    # same as min(abs_diff, loss_clipping) because abs_diff can never be negative (definition of abs value)
                    quadratic_part = tf.clip_by_value(abs_diff, 0.0,
                                                      self._loss_clipping)
                    linear_part = abs_diff - quadratic_part
                    loss = (0.5 * tf.square(quadratic_part)) + (
                        self._loss_clipping * linear_part)
                else:
                    # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                    loss = 0.5 * tf.square(diff)
                # NOTICE: we are summing gradients
                error = tf.reduce_sum(loss)
            summarizer.summarize(error, 'scalar', 'loss')

        # optimizer
        with tf.name_scope('shared-optimizer'):
            self._tf_learning_rate = tf.placeholder(tf.float32)
            optimizer = self._optimizer_fn(
                learning_rate=self._tf_learning_rate)
            # only train the network vars not the target network
            gradients = optimizer.compute_gradients(
                error, var_list=self._tf_network_trainables)
            # gradients are stored as a tuple, (gradient, tensor the gradient corresponds to)
            # kinda lame that clip by global norm doesn't accept the list of tuples returned from compute_gradients
            # so we unzip then zip
            tensors = [tensor for gradient, tensor in gradients]
            grads = [gradient for gradient, tensor in gradients]
            clipped_gradients, _ = tf.clip_by_global_norm(
                grads,
                self.global_norm_clipping)  # returns list[tensors], norm
            # materialize as a list: zip is a one-shot iterator on Python 3 and is consumed twice below
            clipped_grads_tensors = list(zip(clipped_gradients, tensors))
            self._tf_train_step = optimizer.apply_gradients(
                clipped_grads_tensors)
            # tflearn smartly knows how gradients are stored so we just pass in the list of tuples
            summarizer.summarize_gradients(clipped_grads_tensors)

            # tf learn auto merges all summaries so we just have to grab the last output
            self._tf_summaries = summarizer.summarize(self._tf_learning_rate,
                                                      'scalar',
                                                      'learning-rate')
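The optimizer block above unzips the (gradient, variable) pairs returned by compute_gradients, clips by global norm, and re-zips before apply_gradients. A compact sketch of that pattern (TF 1.x assumed; the helper name clipped_minimize is made up):

    import tensorflow as tf

    def clipped_minimize(optimizer, loss, var_list, clip_norm):
        grads_and_vars = optimizer.compute_gradients(loss, var_list=var_list)
        grads = [g for g, v in grads_and_vars]
        variables = [v for g, v in grads_and_vars]
        clipped, _ = tf.clip_by_global_norm(grads, clip_norm)  # returns (list of tensors, global norm)
        # materialize the list so it can be reused (zip is a one-shot iterator on Python 3)
        return optimizer.apply_gradients(list(zip(clipped, variables)))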
    def create_network_graph(self, input_shape, output_num, network_generator, q_discount, optimizer, loss_clipping):
        # Input placeholders
        with tf.name_scope('input'):
            # reorder the input from (batch, channels, height, width) to
            # TensorFlow's expected layout (batch, height, width, channels)
            x_input_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            x_input = tf.cast(tf.transpose(x_input_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            x_input_tp1_channel_firstdim = tf.placeholder(tf.uint8, [None] + input_shape, name='x-input-tp1')
            # transpose because tf wants channels on last dim and channels are passed in on 2nd dim
            x_input_tp1 = tf.cast(tf.transpose(x_input_tp1_channel_firstdim, perm=[0, 2, 3, 1]), tf.float32) / 255.0
            x_actions = tf.placeholder(tf.int32, shape=[None], name='x-actions')
            # TODO: SARSA only change
            x_actions_tp1 = tf.placeholder(tf.int32, shape=[None], name='x-actions-tp1')
            x_rewards = tf.placeholder(tf.float32, shape=[None], name='x-rewards')
            x_terminals = tf.placeholder(tf.bool, shape=[None], name='x-terminals')
            x_discount = q_discount

        # The target network does not reuse variables, so we use two separate variable scopes
        with tf.variable_scope('network'):
            network_output = network_generator(x_input, output_num)

            # summarize a histogram of each action output
            for output_ind in range(output_num):
                summarizer.summarize(network_output[:, output_ind], 'histogram', 'network-output/{0}'.format(output_ind))

            # get the trainable variables for this network, later used to overwrite target network vars
            network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='network')

            # summarize activations
            summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='network'))

            # add network summaries
            summarizer.summarize_variables(train_vars=network_trainables)

        with tf.variable_scope('target-network'):
            target_network_output = network_generator(x_input_tp1, output_num)

            # get trainables for target network, used in assign op for the update target network step
            target_network_trainables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='target-network')

            # summarize activations
            summarizer.summarize_activations(tf.get_collection(tf.GraphKeys.ACTIVATIONS, scope='target-network'))

            # add network summaries
            summarizer.summarize_variables(train_vars=target_network_trainables)

        # update target network with network variables
        with tf.name_scope('update-target-network'):
            update_target_network_ops = [target_v.assign(v) for v, target_v in zip(network_trainables, target_network_trainables)]

        # calculate QLoss
        with tf.name_scope('loss'):
            with tf.name_scope('estimated-reward-tp1'):
                one_minus_term = tf.multiply(1.0 - tf.cast(x_terminals, tf.float32), x_discount)
                # TODO: SARSA only change
                # SARSA uses the target network's Q-estimate of the next state for action_tp1, not the max.
                # We must convert to a one-hot, same as below
                # NumPy/Theano equivalent: est_rew_tp1 = target_network_output[:, x_actions_tp1]
                x_actions_tp1_one_hot = tf.one_hot(x_actions_tp1, depth=output_num, name='one-hot-tp1',
                                                   on_value=1.0, off_value=0.0, dtype=tf.float32)
                # we reduce_sum here because the output could be negative, so we cannot take the max;
                # the other indices will be 0 after the one-hot multiply
                network_est_rew_tp1 = tf.reduce_sum(tf.multiply(target_network_output, x_actions_tp1_one_hot), axis=1)
                est_rew_tp1 = tf.multiply(one_minus_term, network_est_rew_tp1)

            y = x_rewards + tf.stop_gradient(est_rew_tp1)

            with tf.name_scope('estimated-reward'):
                # Because of https://github.com/tensorflow/tensorflow/issues/206
                # we cannot use NumPy-style indexing, so we convert the actions to a one-hot,
                # multiply, then reduce-sum over the last dim
                # NumPy/Theano est_rew = network_output[:, x_actions]
                x_actions_one_hot = tf.one_hot(x_actions, depth=output_num, name='one-hot',
                                               on_value=1.0, off_value=0.0, dtype=tf.float32)
                # we reduce_sum here because the output could be negative, so we cannot take the max;
                # the other indices will be 0 after the one-hot multiply
                est_rew = tf.reduce_sum(tf.multiply(network_output, x_actions_one_hot), axis=1)

            with tf.name_scope('qloss'):
                # clip loss but keep linear past clip bounds
                # REFS: https://github.com/spragunr/deep_q_rl/blob/master/deep_q_rl/q_network.py#L108
                # https://github.com/Jabberwockyll/deep_rl_ale/blob/master/q_network.py#L241
                diff = y - est_rew

                if loss_clipping > 0.0:
                    abs_diff = tf.abs(diff)
                    # same as min(abs_diff, loss_clipping) because abs_diff can never be negative (definition of abs value)
                    quadratic_part = tf.clip_by_value(abs_diff, 0.0, loss_clipping)
                    linear_part = abs_diff - quadratic_part
                    loss = (0.5 * tf.square(quadratic_part)) + (loss_clipping * linear_part)
                else:
                    # But why multiply the loss by 0.5 when not clipping? https://groups.google.com/forum/#!topic/deep-q-learning/hKK0ZM_OWd4
                    loss = 0.5 * tf.square(diff)
                # NOTICE: we are summing gradients
                error = tf.reduce_sum(loss)
            summarizer.summarize(error, 'scalar', 'loss')

        # optimizer
        with tf.name_scope('shared-optimizer'):
            tf_learning_rate = tf.placeholder(tf.float32)
            optimizer = optimizer(learning_rate=tf_learning_rate)
            # only train the network vars not the target network
            tf_train_step = optimizer.minimize(error, var_list=network_trainables)

            # tf learn auto merges all summaries so we just have to grab the last output
            tf_summaries = summarizer.summarize(tf_learning_rate, 'scalar', 'learning-rate')

        # function to get network output
        def get_output(sess, state):
            feed_dict = {x_input_channel_firstdim: state}
            return sess.run([network_output], feed_dict=feed_dict)

        # function to run a single train step
        def train_step(sess, current_learning_rate, state, action, reward, state_tp1, action_tp1, terminal, summaries=False):
            feed_dict = {x_input_channel_firstdim: state, x_input_tp1_channel_firstdim: state_tp1,
                         x_actions: action, x_actions_tp1: action_tp1, x_rewards: reward, x_terminals: terminal,
                         # TODO: SARSA only change action_tp1
                         tf_learning_rate: current_learning_rate}
            if summaries:
                return sess.run([tf_summaries, tf_train_step], feed_dict=feed_dict)[0]
            else:
                return sess.run([tf_train_step], feed_dict=feed_dict)

        def update_target_net(sess):
            return sess.run([update_target_network_ops])

        self._get_output = get_output
        self._train_step = train_step
        self._update_target_network = update_target_net
        self.saver = tf.train.Saver(var_list=network_trainables)
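To make the SARSA-only change above concrete: the bootstrap uses the target network's Q-value for the action actually taken at t+1 rather than the max over actions. A tiny NumPy sketch with made-up values:

    import numpy as np

    target_q_tp1 = np.array([[1.0, 2.0, 3.0],
                             [4.0, 5.0, 6.0]])
    actions_tp1 = np.array([0, 2])        # actions actually taken at t+1
    rewards = np.array([0.5, 1.0])
    terminals = np.array([False, False])
    gamma = 0.99

    not_terminal_discount = (1.0 - terminals.astype(np.float32)) * gamma
    sarsa_y = rewards + not_terminal_discount * target_q_tp1[np.arange(2), actions_tp1]  # uses Q(s', a'): [1.49, 6.94]
    qlearning_y = rewards + not_terminal_discount * target_q_tp1.max(axis=1)             # uses max_a Q(s', a): [3.47, 6.94]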