Example #1
    def __init__(self, name, obs_size=2, action_size=2, actor_hidden_size=32, critic_hidden_size=32, pow_learning_rate=0.001,  
                   RB_learning_rate=0.001, entropy_cost=0.01, normalise_entropy=True, lambda_=0., baseline_cost=1.):
    
        with tf.variable_scope(name):
            # hyperparameter bootstrap_n determines the batch size
            self.name=name
            self.input_ = tf.placeholder(tf.float32, [None, obs_size], name='inputs')
            self.action_RB_ = tf.placeholder(tf.int32, [None, 1], name='action_RB')
            self.action_pow_ = tf.placeholder(tf.int32, [None, 1], name='action_pow')
            self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
            self.discount_ = tf.placeholder(tf.float32, [None, 1], name='discount')
            self.bootstrap_ = tf.placeholder(tf.float32, [None], name='bootstrap')

            # set up actor network
            self.fc1_actor_ = tf.contrib.layers.fully_connected(self.input_, actor_hidden_size, activation_fn=tf.nn.elu)

            self.fc2_actor_ = tf.contrib.layers.fully_connected(self.fc1_actor_, actor_hidden_size, activation_fn=tf.nn.elu)

            self.fc3_actor_power_ = tf.contrib.layers.fully_connected(self.fc2_actor_, ch.D2D_tr_Power_levels, activation_fn=None)            
            self.fc3_actor_RB_ = tf.contrib.layers.fully_connected(self.fc2_actor_, ch.N_CU, activation_fn=None)

            # reshape the policy logits
            self.policy_logits_RB_ = tf.reshape(self.fc3_actor_RB_, (-1, 1, ch.N_CU))
            self.policy_logits_power_ = tf.reshape(self.fc3_actor_power_, (-1, 1, ch.D2D_tr_Power_levels))
             
            # self.policy_logits_ = self.combine_tensors(self.fc3_actor_power_, self.fc3_actor_RB_) # TO-DO: figure out how to have 2 separate outputs for each action type
            # might need to do two separate updates - policy logits for both pow_sel and RB_sel
            #print(self.policy_logits_)
            #self.policy_logits_ = tf.transpose(self.policy_logits_)
            #self.policy_logits_ = tf.expand_dims(self.policy_logits_, axis=0)
            #self.policy_logits_ = tf.reshape(self.policy_logits_, [-1, 1, action_size]) # these resize it weirdly
            # generate action probabilities for taking actions
            self.action_prob_power_ = tf.nn.softmax(self.fc3_actor_power_)
            self.action_prob_RB_ = tf.nn.softmax(self.fc3_actor_RB_)
            
      
            # set up critic network
            self.fc1_critic_ = tf.contrib.layers.fully_connected(self.input_, critic_hidden_size, activation_fn=tf.nn.elu)

            self.fc2_critic_ = tf.contrib.layers.fully_connected(self.fc1_critic_, critic_hidden_size, activation_fn=tf.nn.elu)

            self.baseline_ = tf.contrib.layers.fully_connected(self.fc2_critic_, 1, activation_fn=None)
      
            # Define Loss with TRFL
            self.seq_aac_return_pow_ = trfl.sequence_advantage_actor_critic_loss(self.policy_logits_power_, self.baseline_, self.action_pow_,
                self.reward_, self.discount_, self.bootstrap_, lambda_=lambda_, entropy_cost=entropy_cost, 
                baseline_cost=baseline_cost, normalise_entropy=normalise_entropy)          

            self.seq_aac_return_RB_ = trfl.sequence_advantage_actor_critic_loss(self.policy_logits_RB_, self.baseline_, self.action_RB_,
                self.reward_, self.discount_, self.bootstrap_, lambda_=lambda_, entropy_cost=entropy_cost, 
                baseline_cost=baseline_cost, normalise_entropy=normalise_entropy)

            # Optimize the loss
            self.ac_loss_pow_ = tf.reduce_mean(self.seq_aac_return_pow_.loss)
            self.ac_loss_RB_ = tf.reduce_mean(self.seq_aac_return_RB_.loss)
            self.ac_optim_pow_ = tf.train.AdamOptimizer(learning_rate=pow_learning_rate).minimize(self.ac_loss_pow_)
            self.ac_optim_RB_ = tf.train.AdamOptimizer(learning_rate=RB_learning_rate).minimize(self.ac_loss_RB_)
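
A minimal training-step sketch for the two-headed agent above. Everything here is an assumption for illustration: the hypothetical class name D2DActorCritic wrapping this __init__, the rollout length, and the dummy action values; it presumes TF 1.x, trfl, and the ch module are importable.

import numpy as np
import tensorflow as tf

net = D2DActorCritic('d2d_agent')   # hypothetical class built around the __init__ above

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    T = 8   # rollout length; the leading dim is time, batch size is 1
    feed = {
        net.input_:      np.random.randn(T, 2).astype(np.float32),
        net.action_pow_: np.zeros((T, 1), np.int32),   # must lie in [0, ch.D2D_tr_Power_levels)
        net.action_RB_:  np.zeros((T, 1), np.int32),   # must lie in [0, ch.N_CU)
        net.reward_:     np.random.randn(T, 1).astype(np.float32),
        net.discount_:   0.99 * np.ones((T, 1), np.float32),
        net.bootstrap_:  np.zeros((1,), np.float32),   # zero bootstrap for a finished rollout
    }
    # Both heads can be stepped in a single run; they share the critic baseline.
    _, _, loss_pow, loss_RB = sess.run(
        [net.ac_optim_pow_, net.ac_optim_RB_, net.ac_loss_pow_, net.ac_loss_RB_],
        feed_dict=feed)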
Example #2
    def __init__(self,
                 name,
                 obs_size=2,
                 action_size=2,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 ac_learning_rate=0.001,
                 entropy_cost=0.01,
                 normalise_entropy=True,
                 lambda_=0.,
                 baseline_cost=1.):

        with tf.variable_scope(name):
            # hyperparameter bootstrap_n determines the batch size
            self.name = name
            self.input_ = tf.placeholder(tf.float32, [None, obs_size],
                                         name='inputs')
            self.action_ = tf.placeholder(tf.int32, [None, 1], name='action')
            self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
            self.discount_ = tf.placeholder(tf.float32, [None, 1],
                                            name='discount')
            self.bootstrap_ = tf.placeholder(tf.float32, [None],
                                             name='bootstrap')

            # set up actor network
            self.fc1_actor_ = tf.contrib.layers.fully_connected(
                self.input_, actor_hidden_size, activation_fn=tf.nn.relu)
            # keep this layer linear: TRFL expects unnormalised policy logits
            self.fc2_actor_ = tf.contrib.layers.fully_connected(
                self.fc1_actor_, action_size, activation_fn=None)
            # reshape the policy logits
            self.policy_logits_ = tf.reshape(self.fc2_actor_,
                                             [-1, 1, action_size])

            # generate action probabilities for taking actions
            self.action_prob_ = tf.nn.softmax(self.fc2_actor_)

            # set up critic network
            self.fc1_critic_ = tf.contrib.layers.fully_connected(
                self.input_, critic_hidden_size, activation_fn=tf.nn.relu)
            self.baseline_ = tf.contrib.layers.fully_connected(
                self.fc1_critic_, 1, activation_fn=None)

            # TRFL usage
            self.seq_aac_return_ = trfl.sequence_advantage_actor_critic_loss(
                self.policy_logits_,
                self.baseline_,
                self.action_,
                self.reward_,
                self.discount_,
                self.bootstrap_,
                lambda_=lambda_,
                entropy_cost=entropy_cost,
                baseline_cost=baseline_cost,
                normalise_entropy=normalise_entropy)

            # Optimize the loss
            self.ac_loss_ = tf.reduce_mean(self.seq_aac_return_.loss)
            self.ac_optim_ = tf.train.AdamOptimizer(
                learning_rate=ac_learning_rate).minimize(self.ac_loss_)
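
At act time the probabilities in action_prob_ can be sampled directly. A hedged one-step sketch; the instance name net, the observation array obs (length obs_size), and the open session sess are assumptions:

import numpy as np

probs = sess.run(net.action_prob_, feed_dict={net.input_: obs[None, :]})[0]
action = np.random.choice(len(probs), p=probs)   # sample an action from the current policy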
Example #3
    def get_loss(self, x, at, r):
        # TODO change to PPO?
        # r is time-major with shape [T, B]
        T, B = tf.unstack(tf.shape(r))

        h = tf.map_fn(self.net, x)
        policy_logits = h[:, :, :self.n_actions]
        v = h[:, :, self.n_actions]

        pg_loss, extra = trfl.sequence_advantage_actor_critic_loss(
            policy_logits=policy_logits,
            baseline_values=v,
            actions=at,
            rewards=r,
            pcontinues=0.99 * tf.ones_like(r),
            bootstrap_value=tf.ones(tf.shape(r)[1]),
            entropy_cost=1e-1,
            normalise_entropy=True)

        loss = tf.reduce_mean(pg_loss)
        tf.contrib.summary.scalar('loss/policy', loss)
        return loss
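
get_loss above assumes self.net maps each observation to n_actions + 1 outputs per step: the first n_actions columns are the policy logits and the last column is the value estimate. A hypothetical head with that layout (not the author's actual self.net) could look like:

import tensorflow as tf

n_actions = 4   # assumed for illustration

# Final layer is linear with n_actions + 1 units:
#   output[..., :n_actions] -> policy logits, output[..., n_actions] -> value
net = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(n_actions + 1),
])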
Example #4
    def train_step(self, x, a, r):
        """
        Train on a batch of data.

        x: (B, T)
        a: (B, T)
        r: (B, T)
        """
        x = tf.constant(x,
                        shape=[self.batch_size, self.time_steps, 1],
                        dtype=tf.float32)
        a = tf.constant(a,
                        shape=[self.batch_size, self.time_steps, 1],
                        dtype=tf.float32)
        r = tf.constant(r,
                        shape=[self.batch_size, self.time_steps, 1],
                        dtype=tf.float32)

        # current obs, old action, old reward
        inputs = tf.concat([x[:, 1:, :], a[:, :-1, :], r[:, :-1, :]], axis=-1)

        actions_taken = tf.transpose(tf.squeeze(tf.cast(a[:, 1:, :],
                                                        tf.int32)))
        rewards_received = tf.transpose(r[:, 1:, 0])
        returns = tf.reduce_sum(rewards_received, axis=0)

        with tf.GradientTape() as tape:

            logits, v = self.forward(inputs)
            # NOTE does this loss fn correct for its off policy nature?
            policy_loss, extra = trfl.sequence_advantage_actor_critic_loss(
                policy_logits=logits,
                # Q: how can A2C be extended with distributional estimates of value?
                baseline_values=v[..., 0],
                actions=actions_taken,
                rewards=rewards_received,
                pcontinues=0.99 * tf.ones_like(rewards_received),
                bootstrap_value=returns,
                entropy_cost=1.0,
                lambda_=0.5)
            beta = tf.constant(1.0 * 0.5**(self.global_step / 2000))
            beta = tf.cast(beta, tf.float32)

            loss = tf.reduce_mean(0.05 * extra.baseline_loss +
                                  extra.policy_gradient_loss +
                                  beta * extra.entropy_loss)

        variables = self.nn.variables + self.rnn.variables
        grads = tape.gradient(loss, variables)
        self.opt.apply_gradients(zip(grads, variables),
                                 global_step=self.global_step)

        with tf.contrib.summary.record_summaries_every_n_global_steps(10):
            tf.contrib.summary.scalar('loss', loss)
            tf.contrib.summary.scalar('extra/baseline_loss',
                                      tf.reduce_mean(extra.baseline_loss))
            tf.contrib.summary.scalar(
                'extra/policy_gradient_loss',
                tf.reduce_mean(extra.policy_gradient_loss))
            tf.contrib.summary.scalar('extra/entropy_loss',
                                      tf.reduce_mean(extra.entropy_loss))
            tf.contrib.summary.scalar('total_R', tf.reduce_sum(r))
            tf.contrib.summary.histogram('actions', a)
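
A hedged call-site sketch for train_step; the agent instance, its batch_size/time_steps settings, and the action count are assumptions, and the arrays only need the flat (B, T) layout documented in the docstring:

import numpy as np

B, T = agent.batch_size, agent.time_steps
n_actions = 2   # assumed; must match the policy head

x = np.random.randn(B, T).astype(np.float32)       # observations
a = np.random.randint(0, n_actions, size=(B, T))   # actions taken
r = np.random.randn(B, T).astype(np.float32)       # rewards received
agent.train_step(x, a, r)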
Example #5
File: a2c_lstm_2.py  Project: Amineh-A/lab
    def __init__(self, name, num_envs=4, n=20, entropy_reg_term=.05, 
        normalise_entropy=False, max_steps=5, learning_rate=0.001):
        # Network parameters
        # Hardcoded so as not to be global now
        self.state_size = (80,80,3)
        action_size = 6

        kernel_size_1 = [8,8,3]
        output_filters_conv1 = 32     
        output_filters_conv2 = 64     
        output_filters_conv3 = 64     
        hidden_size = 512               # number of units in hidden layer
        lstm_size = 256
        tf.reset_default_graph() 
        with tf.variable_scope(name):
            self.name = name

            # Input images
            self.inputs_ = tf.placeholder(tf.float32, [None, self.state_size[0], 
                self.state_size[1], self.state_size[2]], name='inputs_')

            # One hot encode the actions:
            # [look_left, look_right, strafe_left, strafe_right, forward, backward]
            self.actions = tf.placeholder(tf.int32, [None, num_envs], name='actions')
            self.rewards = tf.placeholder(tf.float32, [None, num_envs], name='rewards')
            
            # Conv layers
            self.conv1 = tf.contrib.layers.conv2d(self.inputs_, output_filters_conv1, kernel_size=8, stride=2)
            self.conv2 = tf.contrib.layers.conv2d(self.conv1, output_filters_conv2, kernel_size=4, stride=2)
            self.conv3 = tf.contrib.layers.conv2d(self.conv2, output_filters_conv3, kernel_size=4, stride=1)

            # Constructing input to AC network
            self.actions_input = tf.reshape(tf.one_hot(self.actions, action_size), [-1, action_size])
            self.rewards_input = tf.reshape(self.rewards, [-1, 1])
            self.vision_input = tf.reshape(self.conv3, [-1, self.conv3.shape[1]*self.conv3.shape[2]*self.conv3.shape[3]])

            self.ac_input = tf.concat([self.actions_input, self.rewards_input, self.vision_input], axis=1)

            # FC Layer
            self.fc1 = tf.contrib.layers.fully_connected(self.ac_input, hidden_size)

            # LSTM Layer
            self.lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_size, state_is_tuple=False)
            self.lstm_hidden_state_input = tf.placeholder_with_default(
                                        self.lstm_cell.zero_state(batch_size=num_envs, dtype=tf.float32),
                                        [num_envs, hidden_size])
            # Note: with state_is_tuple=False the LSTM state is [c, h] concatenated,
            # so its width is 2 * lstm_size (= 512), which happens to equal hidden_size here.


            self.lstm_input = tf.reshape(self.fc1, [-1, num_envs, hidden_size])

            # Dynamic RNN code - might not need to be dynamic
            self.lstm_output, self.lstm_hidden_state_output = tf.nn.dynamic_rnn(
                self.lstm_cell,
                self.lstm_input,
                initial_state=self.lstm_hidden_state_input,
                dtype=tf.float32,
                time_major=True,
                # parallel_iterations=num_envs,  # number of time steps processed in parallel (trades memory for speed)
                # swap_memory=True,  # swap tensors to CPU during backprop to save GPU memory
            )

            self.lstm_output_flat = tf.reshape(self.lstm_output, [-1, lstm_size])


            # Value function - Linear output layer
            self.value_output = tf.contrib.layers.fully_connected(self.lstm_output_flat, 1, 
                                                            activation_fn=None)

            # Policy - softmax output layer
            self.policy_logits = tf.contrib.layers.fully_connected(self.lstm_output_flat, action_size, activation_fn=None)
            self.policy_output = tf.contrib.layers.softmax(self.policy_logits)
            # Action sampling op
            self.action_output = tf.squeeze(tf.multinomial(logits=self.policy_logits,num_samples=1), axis=1)

            # Used for TRFL stuff
            self.value_output_unflat = tf.reshape(self.value_output, [n, num_envs])
            self.policy_logits_unflat = tf.reshape(self.policy_logits, [n, num_envs, -1])

            self.discounts = tf.placeholder(tf.float32,[n, num_envs],name="discounts")
            self.initial_Rs = tf.placeholder(tf.float32, [num_envs], name="initial_Rs")

            #TRFL loss
            a2c_loss, extra = trfl.sequence_advantage_actor_critic_loss(
                policy_logits = self.policy_logits_unflat,
                baseline_values = self.value_output_unflat, 
                actions = self.actions, 
                rewards = self.rewards,
                pcontinues = self.discounts, 
                bootstrap_value = self.initial_Rs,
                entropy_cost = entropy_reg_term,
                normalise_entropy = normalise_entropy)
            self.loss = tf.reduce_mean(a2c_loss)
            self.extra = extra
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

            print("Network shapes:")
            print("inputs_: ", self.inputs_.shape)
            print("actions: ", self.actions.shape)
            print("conv1: ", self.conv1.shape)
            print("conv2: ", self.conv2.shape)
            print("conv3: ", self.conv3.shape)
            print("ac_input: ", self.ac_input.shape)
            print("fc1: ", self.fc1.shape)
            print("lstm_hidden_state_input: ", self.lstm_hidden_state_input.shape)
            print("lstm_input: ", self.lstm_input.shape)
            print("lstm_hidden_state_output: ", self.lstm_hidden_state_output.shape)
            print("lstm_output: ", self.lstm_output.shape)
            print("lstm_output_flat: ", self.lstm_output_flat.shape)
            print("value_output: ", self.value_output.shape)
            print("policy_logits: ", self.policy_logits.shape)
            print("policy_output: ", self.policy_output.shape)
            print("value_output_unflat: ", self.value_output_unflat.shape)
            print("policy_logits_unflat: ", self.policy_logits_unflat.shape)


            print(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))
            graph = tf.get_default_graph()

            # TODO: get this some other way?
            conv1_w_summary = tf.summary.histogram('conv1 weights',graph.get_tensor_by_name("main_acn/Conv/weights:0"))
            conv1_b_summary = tf.summary.histogram('conv1 biases',graph.get_tensor_by_name("main_acn/Conv/biases:0"))
            conv2_w_summary = tf.summary.histogram('conv2 weights',graph.get_tensor_by_name("main_acn/Conv_1/weights:0"))
            conv2_b_summary = tf.summary.histogram('conv2 biases',graph.get_tensor_by_name("main_acn/Conv_1/biases:0"))
            conv3_w_summary = tf.summary.histogram('conv3 weights',graph.get_tensor_by_name("main_acn/Conv_2/weights:0"))
            conv3_b_summary = tf.summary.histogram('conv3 biases',graph.get_tensor_by_name("main_acn/Conv_2/biases:0"))
            fc1_w_summary = tf.summary.histogram('fc1 weights',graph.get_tensor_by_name("main_acn/fully_connected/weights:0"))
            fc1_b_summary = tf.summary.histogram('fc1 biases',graph.get_tensor_by_name("main_acn/fully_connected/biases:0"))
            lstm_w_summary = tf.summary.histogram('lstm weights',graph.get_tensor_by_name("main_acn/rnn/lstm_cell/kernel:0"))
            lstm_b_summary = tf.summary.histogram('lstm biases',graph.get_tensor_by_name("main_acn/rnn/lstm_cell/bias:0"))
            value_w_summary = tf.summary.histogram('value weights',graph.get_tensor_by_name("main_acn/fully_connected_1/weights:0"))
            value_b_summary = tf.summary.histogram('value biases',graph.get_tensor_by_name("main_acn/fully_connected_1/biases:0"))
            policy_w_summary = tf.summary.histogram('policy weights',graph.get_tensor_by_name("main_acn/fully_connected_2/weights:0"))
            policy_b_summary = tf.summary.histogram('policy biases',graph.get_tensor_by_name("main_acn/fully_connected_2/biases:0"))

            # Tensorboard
            self.average_reward_metric = tf.placeholder(tf.float32, name="average_reward")
            # self.average_length_of_episode = tf.placeholder(tf.float32, name="average_length_of_episode")

            conv1_summary = tf.summary.histogram('conv1', self.conv1)
            conv2_summary = tf.summary.histogram('conv2', self.conv2)
            conv3_summary = tf.summary.histogram('conv3', self.conv3)
            fc1_summary = tf.summary.histogram('fc1', self.fc1)
            # lstm_summary = tf.summary.histogram('lstm', self.lstm_cell)

            policy_summary = tf.summary.tensor_summary('policy', self.policy_output)
            reward_summary = tf.summary.scalar('average_reward_metric', self.average_reward_metric)
            loss_summary = tf.summary.scalar('loss', self.loss)
            entropy_summary = tf.summary.scalar('policy_entropy', tf.math.reduce_mean(self.extra.entropy))
            baseline_loss_summary = tf.summary.scalar('baseline_loss', tf.math.reduce_mean(self.extra.baseline_loss))
            entropy_loss_summary = tf.summary.scalar('entropy_loss', tf.math.reduce_mean(self.extra.entropy_loss))
            policy_gradient_loss = tf.summary.scalar('policy_gradient_loss', tf.math.reduce_mean(self.extra.policy_gradient_loss))
            
            self.train_step_summary = tf.summary.merge([
                reward_summary,
                loss_summary,
                entropy_summary,
                baseline_loss_summary,
                entropy_loss_summary,
                policy_gradient_loss,
                conv1_w_summary,
                conv1_b_summary,
                conv2_w_summary,
                conv2_b_summary,
                conv3_w_summary,
                conv3_b_summary,
                fc1_w_summary,
                fc1_b_summary,
                lstm_w_summary,
                lstm_b_summary,
                value_w_summary,
                value_b_summary,
                policy_w_summary,
                policy_b_summary
                ])

            self.action_step_summary = tf.summary.merge([policy_summary])
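
A hedged sketch of how the feed for the TRFL loss above is typically assembled for an n-step rollout. Every name is an assumption: net is an instance of this class, gamma the discount factor, dones the per-step episode-end flags, last_obs/last_actions/last_rewards describe the step after the rollout, and the two LSTM states are the ones recorded before and after the rollout. initial_Rs is the critic's bootstrap estimate V(s_{t+n}).

discounts = gamma * (1.0 - dones)                   # [n, num_envs]; zero where an episode ended

bootstrap_values = sess.run(
    net.value_output,
    feed_dict={
        net.inputs_: last_obs,                      # [num_envs, 80, 80, 3]
        net.actions: last_actions[None, :],         # [1, num_envs]
        net.rewards: last_rewards[None, :],         # [1, num_envs]
        net.lstm_hidden_state_input: lstm_state_after_rollout,
    })[:, 0]                                        # -> [num_envs]

sess.run(net.opt, feed_dict={
    net.inputs_: obs_batch,                         # [n * num_envs, 80, 80, 3]
    net.actions: actions_batch,                     # [n, num_envs]
    net.rewards: rewards_batch,                     # [n, num_envs]
    net.discounts: discounts,
    net.initial_Rs: bootstrap_values,
    net.lstm_hidden_state_input: lstm_state_before_rollout,
})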
Example #6
    def __init__(self,
                 name,
                 obs_size=2,
                 action_size=2,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 ac_learning_rate=0.001,
                 entropy_cost=0.01,
                 normalise_entropy=True,
                 lambda_=0.,
                 baseline_cost=1.):

        with tf.variable_scope(name):
            # network variables
            self.name = name
            self.input_ = tf.placeholder(tf.float32, [None, obs_size],
                                         name='inputs')
            self.action_ = tf.placeholder(tf.int32, [None, 1], name='action')
            self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
            self.discount_ = tf.placeholder(tf.float32, [None, 1],
                                            name='discount')
            self.bootstrap_ = tf.placeholder(tf.float32, [None],
                                             name='bootstrap')

            # set up actor network (approximates optimal policy)
            self.fc1_actor_ = tf.contrib.layers.fully_connected(
                self.input_, actor_hidden_size, activation_fn=tf.nn.elu)
            self.fc2_actor_ = tf.contrib.layers.fully_connected(
                self.fc1_actor_, actor_hidden_size, activation_fn=tf.nn.elu)
            self.fc3_actor_ = tf.contrib.layers.fully_connected(
                self.fc2_actor_, action_size, activation_fn=None)
            # reshape the policy logits
            self.policy_logits_ = tf.reshape(self.fc3_actor_,
                                             [-1, 1, action_size])

            # generate action probabilities for taking actions
            self.action_prob_ = tf.nn.softmax(self.fc3_actor_)

            # set up critic network (approximates optimal state-value function (used as a baseline to reduce variance of loss gradient))
            # - uses policy evaluation (e.g. Monte-Carlo / TD learning) to estimate the advantage
            self.fc1_critic_ = tf.contrib.layers.fully_connected(
                self.input_, critic_hidden_size, activation_fn=tf.nn.elu)
            self.fc2_critic_ = tf.contrib.layers.fully_connected(
                self.fc1_critic_, critic_hidden_size, activation_fn=tf.nn.elu)
            self.baseline_ = tf.contrib.layers.fully_connected(
                self.fc2_critic_, 1, activation_fn=None)

            # Calculates the loss for an A2C update along a batch of trajectories. (TRFL)
            self.seq_aac_return_ = trfl.sequence_advantage_actor_critic_loss(
                self.policy_logits_,
                self.baseline_,
                self.action_,
                self.reward_,
                self.discount_,
                self.bootstrap_,
                lambda_=lambda_,
                entropy_cost=entropy_cost,
                baseline_cost=baseline_cost,
                normalise_entropy=normalise_entropy)

            # Optimize the loss
            self.ac_loss_ = tf.reduce_mean(self.seq_aac_return_.loss)
            self.ac_optim_ = tf.train.AdamOptimizer(
                learning_rate=ac_learning_rate).minimize(self.ac_loss_)
Example #7
    def __init__(self, name):
        # state inputs to the Q-network
        with tf.variable_scope(name):
            # self.inputs_ = tf.placeholder(tf.float32, [n, num_envs, state_size[0], state_size[1], state_size[2]], name='inputs')
            # self.inputs_flat = tf.reshape(self.inputs_, [n * num_envs, state_size[0], state_size[1], state_size[2]])

            self.inputs_ = tf.placeholder(
                tf.float32,
                [None, state_size[0], state_size[1], state_size[2]],
                name='inputs')
            # Actions for the QNetwork:
            # One-hot vector, with each action being as follows:
            # (look_left, look_right, strafe_left, strafe_right, forward, backward)
            # These are mapped to the deepmind-lab (not one-hot) actions with the same names
            # defined in ACTIONS

            # One hot encode the actions to later choose the Q-value for the action
            self.actions_ = tf.placeholder(tf.int32, [n, num_envs],
                                           name='actions')

            # ReLU hidden layers
            self.conv1 = tf.contrib.layers.conv2d(self.inputs_,
                                                  output_filters_conv1,
                                                  kernel_size=8,
                                                  stride=2)
            self.conv2 = tf.contrib.layers.conv2d(self.conv1,
                                                  output_filters_conv2,
                                                  kernel_size=4,
                                                  stride=2)
            self.conv3 = tf.contrib.layers.conv2d(self.conv2,
                                                  output_filters_conv3,
                                                  kernel_size=4,
                                                  stride=1)

            self.fc1 = tf.contrib.layers.fully_connected(
                tf.reshape(self.conv3, [
                    -1,
                    self.conv3.shape[1] * self.conv3.shape[2] * self.conv3.shape[3]
                ]), hidden_size)

            # Value function - Linear output layer
            self.value_output = tf.contrib.layers.fully_connected(
                self.fc1, 1, activation_fn=None)

            # Policy - softmax output layer
            self.policy_logits = tf.contrib.layers.fully_connected(
                self.fc1, action_size, activation_fn=None)
            self.policy_output = tf.contrib.layers.softmax(self.policy_logits)

            self.action_output = tf.squeeze(tf.multinomial(
                logits=self.policy_logits, num_samples=1),
                                            axis=1)

            self.name = name

            self.rewards = tf.placeholder(tf.float32, [n, num_envs],
                                          name="rewards")
            self.discounts = tf.placeholder(tf.float32, [n, num_envs],
                                            name="discounts")
            self.initial_Rs = tf.placeholder(tf.float32, [num_envs],
                                             name="initial_Rs")

            # Used for trfl stuff
            self.value_output_unflat = tf.reshape(self.value_output,
                                                  [n, num_envs])
            self.policy_logits_unflat = tf.reshape(self.policy_logits,
                                                   [n, num_envs, -1])

            print("Network shapes:")
            print("actions_: ", self.actions_.shape)
            print("conv1: ", self.conv1.shape)
            print("conv2: ", self.conv2.shape)
            print("conv3: ", self.conv3.shape)
            print("fc1: ", self.fc1.shape)
            print("value_output: ", self.value_output.shape)
            print("policy_logits: ", self.policy_logits.shape)
            print("policy_output: ", self.policy_output.shape)

            print("policy_logits_unflat: ", self.policy_logits_unflat.shape)
            print("value_output_unflat: ", self.value_output_unflat.shape)

            #TRFL A2C loss
            a2c_loss, extra = trfl.sequence_advantage_actor_critic_loss(
                policy_logits=self.policy_logits_unflat,
                baseline_values=self.value_output_unflat,
                actions=self.actions_,
                rewards=self.rewards,
                pcontinues=self.discounts,
                bootstrap_value=self.initial_Rs,
                entropy_cost=entropy_reg_term,
                normalise_entropy=normalise_entropy)
            self.loss = tf.reduce_mean(a2c_loss)
            self.extra = extra
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(
                self.loss)
Example #8
    def __init__(self,
                 name,
                 state_size=2,
                 action_size=2,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 ac_learning_rate=0.001,
                 entropy_cost=0.01,
                 normalise_entropy=True,
                 lambda_=0.,
                 baseline_cost=1.):

        with tf.variable_scope(name):

            # placeholders
            self.name = name
            self.input = tf.placeholder(tf.float32, [None, state_size],
                                        name='input')
            self.action = tf.placeholder(tf.int32, [None, 1], name='action')
            self.reward = tf.placeholder(tf.float32, [None, 1], name='reward')
            self.discount = tf.placeholder(tf.float32, [None, 1],
                                           name='discount')
            self.bootstrap = tf.placeholder(tf.float32, [None],
                                            name='bootstrap')

            # actor net
            self.actor_hidden_1 = tf.contrib.layers.fully_connected(
                self.input, actor_hidden_size, activation_fn=tf.nn.relu)
            self.actor_hidden_2 = tf.contrib.layers.fully_connected(
                self.actor_hidden_1,
                actor_hidden_size,
                activation_fn=tf.nn.relu)
            self.actor_out = tf.contrib.layers.fully_connected(
                self.actor_hidden_2, action_size, activation_fn=None)

            # policy logits
            self.policy_logits = tf.reshape(self.actor_out,
                                            [-1, 1, action_size],
                                            name='policy_logits')

            # action choice
            self.action_choice = tf.nn.softmax(self.actor_out)

            # critic net
            self.critic_hidden_1 = tf.contrib.layers.fully_connected(
                self.input, critic_hidden_size, activation_fn=tf.nn.relu)
            self.critic_hidden_2 = tf.contrib.layers.fully_connected(
                self.critic_hidden_1,
                critic_hidden_size,
                activation_fn=tf.nn.relu)
            self.critic_out = tf.contrib.layers.fully_connected(
                self.critic_hidden_2, 1, activation_fn=None)  # one baseline value per step

            # loss function
            self.acloss = trfl.sequence_advantage_actor_critic_loss(
                self.policy_logits,
                self.critic_out,
                self.action,
                self.reward,
                self.discount,
                self.bootstrap,
                lambda_=lambda_,
                entropy_cost=entropy_cost,
                baseline_cost=baseline_cost,
                normalise_entropy=normalise_entropy)

            # mean loss and its optimizer
            self.ac_optim = tf.reduce_mean(self.acloss.loss)
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=ac_learning_rate).minimize(self.ac_optim)
Example #9
    def loss(self, observations, rewards, actions, additional_rewards=None):
        """Compute the loss."""
        dummy_zeroth_step_actions = tf.zeros_like(actions[:1])
        all_actions = tf.concat([dummy_zeroth_step_actions, actions], axis=0)
        inputs, features = snt.BatchApply(self._encode)(observations, rewards,
                                                        all_actions)

        rewards = rewards[1:]  # Zeroth step reward not correlated to actions.
        if additional_rewards is not None:
            # Additional rewards are not passed to the encoder (above) in order to be
            # consistent with the step, nor to the recon loss so that recons are
            # consistent with the observations. Thus, additional rewards only affect
            # the returns used to learn the value function.
            rewards += additional_rewards

        initial_state = self._core.initial_state(self._batch_size)

        rnn_inputs = features
        core_outputs = unroll(self._core, initial_state, rnn_inputs)

        # Remove final timestep of outputs.
        core_outputs = nest.map_structure(lambda t: t[:-1], core_outputs)

        if self._with_reconstructions:
            recons = snt.BatchApply(self._decode)(core_outputs.z)
            recon_targets = nest.map_structure(lambda t: t[:-1], inputs)
            recon_loss, recon_logged_values = losses.reconstruction_losses(
                recons=recons,
                targets=recon_targets,
                image_cost=self._image_cost_weight / self._total_num_pixels,
                action_cost=1.,
                reward_cost=1.)
        else:
            recon_loss = tf.constant(0.0)
            recon_logged_values = dict()

        if core_outputs.read_info is not tuple():
            read_reg_loss, read_reg_logged_values = (
                losses.read_regularization_loss(
                    read_info=core_outputs.read_info,
                    strength_cost=self._read_strength_cost,
                    strength_tolerance=self._read_strength_tolerance,
                    strength_reg_mode='L1',
                    key_norm_cost=0.,
                    key_norm_tolerance=1.))
        else:
            read_reg_loss = tf.constant(0.0)
            read_reg_logged_values = dict()

        # Bootstrap value is at end of episode so is zero.
        bootstrap_value = tf.zeros(shape=(self._batch_size, ),
                                   dtype=tf.float32)

        discounts = self._gamma * tf.ones_like(rewards)

        a2c_loss, a2c_loss_extra = trfl.sequence_advantage_actor_critic_loss(
            policy_logits=core_outputs.policy,
            baseline_values=core_outputs.baseline,
            actions=actions,
            rewards=rewards,
            pcontinues=discounts,
            bootstrap_value=bootstrap_value,
            lambda_=self._gamma,
            entropy_cost=self._entropy_cost,
            baseline_cost=self._return_cost_weight,
            name='SequenceA2CLoss')

        a2c_loss = tf.reduce_mean(a2c_loss)  # Average over batch.

        total_loss = a2c_loss + recon_loss + read_reg_loss

        a2c_loss_logged_values = dict(
            pg_loss=tf.reduce_mean(a2c_loss_extra.policy_gradient_loss),
            baseline_loss=tf.reduce_mean(a2c_loss_extra.baseline_loss),
            entropy_loss=tf.reduce_mean(a2c_loss_extra.entropy_loss))
        agent_loss_log = losses.combine_logged_values(a2c_loss_logged_values,
                                                      recon_logged_values,
                                                      read_reg_logged_values)
        agent_loss_log['total_loss'] = total_loss

        return total_loss, agent_loss_log
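
A hedged sketch of how the returned pair might be consumed in a graph-mode training setup; the agent instance, the input tensors, and the optimizer settings are assumptions:

total_loss, loss_log = agent.loss(observations, rewards, actions)
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(total_loss)
summaries = [tf.summary.scalar(name, value) for name, value in loss_log.items()]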
Example #10
    def __init__(self,
                 name,
                 obs_size=2,
                 action_size=2,
                 actor_hidden_size=32,
                 critic_hidden_size=32,
                 pow_learning_rate=0.001,
                 RB_learning_rate=0.001,
                 entropy_cost=0.01,
                 normalise_entropy=True,
                 lambda_=0.,
                 baseline_cost=1.):

        with tf.variable_scope(name):

            # define inputs placeholders for the networks
            self.name = name
            self.input_ = tf.placeholder(tf.float32, [None, obs_size],
                                         name='inputs')
            self.action_RB_ = tf.placeholder(tf.int32, [None, 1],
                                             name='action_RB')
            self.action_pow_ = tf.placeholder(tf.int32, [None, 1],
                                              name='action_pow')
            self.reward_ = tf.placeholder(tf.float32, [None, 1], name='reward')
            self.discount_ = tf.placeholder(tf.float32, [None, 1],
                                            name='discount')
            self.bootstrap_ = tf.placeholder(tf.float32, [None],
                                             name='bootstrap')

            # set up actor network
            self.fc1_actor_ = tf.contrib.layers.fully_connected(
                self.input_, actor_hidden_size, activation_fn=tf.nn.elu)
            self.fc2_actor_ = tf.contrib.layers.fully_connected(
                self.fc1_actor_, actor_hidden_size, activation_fn=tf.nn.elu)
            self.fc3_actor_ = tf.contrib.layers.fully_connected(
                self.fc2_actor_, actor_hidden_size, activation_fn=tf.nn.elu)

            self.fc4_actor_power_ = tf.contrib.layers.fully_connected(
                self.fc3_actor_, ch.D2D_tr_Power_levels, activation_fn=None)
            self.fc4_actor_RB_ = tf.contrib.layers.fully_connected(
                self.fc3_actor_, ch.N_CU, activation_fn=None)

            # reshape the policy logits
            self.policy_logits_RB_ = tf.reshape(self.fc4_actor_RB_,
                                                (-1, 1, ch.N_CU))
            self.policy_logits_power_ = tf.reshape(
                self.fc4_actor_power_, (-1, 1, ch.D2D_tr_Power_levels))

            # generate action probabilities for taking actions
            self.action_prob_power_ = tf.nn.softmax(self.fc4_actor_power_)
            self.action_prob_RB_ = tf.nn.softmax(self.fc4_actor_RB_)

            # set up critic network
            self.fc1_critic_ = tf.contrib.layers.fully_connected(
                self.input_, critic_hidden_size, activation_fn=tf.nn.elu)
            self.fc2_critic_ = tf.contrib.layers.fully_connected(
                self.fc1_critic_, critic_hidden_size, activation_fn=tf.nn.elu)
            self.baseline_ = tf.contrib.layers.fully_connected(
                self.fc2_critic_, 1, activation_fn=None)

            # Define Loss with TRFL
            self.seq_aac_return_pow_ = trfl.sequence_advantage_actor_critic_loss(
                self.policy_logits_power_,
                self.baseline_,
                self.action_pow_,
                self.reward_,
                self.discount_,
                self.bootstrap_,
                lambda_=lambda_,
                entropy_cost=entropy_cost,
                baseline_cost=baseline_cost,
                normalise_entropy=normalise_entropy)

            self.seq_aac_return_RB_ = trfl.sequence_advantage_actor_critic_loss(
                self.policy_logits_RB_,
                self.baseline_,
                self.action_RB_,
                self.reward_,
                self.discount_,
                self.bootstrap_,
                lambda_=lambda_,
                entropy_cost=entropy_cost,
                baseline_cost=baseline_cost,
                normalise_entropy=normalise_entropy)

            # Optimize the loss
            self.ac_loss_pow_ = tf.reduce_mean(self.seq_aac_return_pow_.loss)
            self.ac_loss_RB_ = tf.reduce_mean(self.seq_aac_return_RB_.loss)
            self.ac_optim_pow_ = tf.train.AdamOptimizer(
                learning_rate=pow_learning_rate).minimize(self.ac_loss_pow_)
            self.ac_optim_RB_ = tf.train.AdamOptimizer(
                learning_rate=RB_learning_rate).minimize(self.ac_loss_RB_)
Example #11
    def __init__(self, name):
        with tf.variable_scope(name):
            self.name = name

            # Input images
            self.inputs_ = tf.placeholder(
                tf.float32,
                [None, state_size[0], state_size[1], state_size[2]],
                name='inputs')

            # One hot encode the actions:
            # [look_left, look_right, strafe_left, strafe_right, forward, backward]
            self.actions = tf.placeholder(tf.int32, [None, num_envs],
                                          name='actions')
            self.rewards = tf.placeholder(tf.float32, [None, num_envs],
                                          name='rewards')

            # Conv layers
            self.conv1 = tf.contrib.layers.conv2d(self.inputs_,
                                                  output_filters_conv1,
                                                  kernel_size=8,
                                                  stride=2)
            self.conv2 = tf.contrib.layers.conv2d(self.conv1,
                                                  output_filters_conv2,
                                                  kernel_size=4,
                                                  stride=2)
            self.conv3 = tf.contrib.layers.conv2d(self.conv2,
                                                  output_filters_conv3,
                                                  kernel_size=4,
                                                  stride=1)

            # Constructing input to AC network
            self.actions_input = tf.reshape(
                tf.one_hot(self.actions, action_size), [-1, action_size])
            self.rewards_input = tf.reshape(self.rewards, [-1, 1])
            self.vision_input = tf.reshape(self.conv3, [
                -1,
                self.conv3.shape[1] * self.conv3.shape[2] * self.conv3.shape[3]
            ])

            self.ac_input = tf.concat(
                [self.actions_input, self.rewards_input, self.vision_input],
                axis=1)

            # FC Layer
            self.fc1 = tf.contrib.layers.fully_connected(
                self.ac_input, hidden_size)

            # LSTM Layer
            self.lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_size,
                                                     state_is_tuple=False)
            self.lstm_hidden_state_input = tf.placeholder_with_default(
                self.lstm_cell.zero_state(batch_size=num_envs,
                                          dtype=tf.float32),
                [num_envs, hidden_size])
            # Note: with state_is_tuple=False the LSTM state is [c, h] concatenated,
            # so its width is 2 * lstm_size (= 512), which happens to equal hidden_size here.

            self.lstm_input = tf.reshape(self.fc1, [-1, num_envs, hidden_size])

            # Dynamic RNN code - might not need to be dynamic
            self.lstm_output, self.lstm_hidden_state_output = tf.nn.dynamic_rnn(
                self.lstm_cell,
                self.lstm_input,
                initial_state=self.lstm_hidden_state_input,
                dtype=tf.float32,
                time_major=True,
                # parallel_iterations=num_envs,  # number of time steps processed in parallel (trades memory for speed)
                # swap_memory=True,  # swap tensors to CPU during backprop to save GPU memory
            )

            self.lstm_output_flat = tf.reshape(self.lstm_output,
                                               [-1, lstm_size])
            # TODO: rethink layer shapes?

            # Value function - Linear output layer
            self.value_output = tf.contrib.layers.fully_connected(
                self.lstm_output_flat, 1, activation_fn=None)

            # Policy - softmax output layer
            self.policy_logits = tf.contrib.layers.fully_connected(
                self.lstm_output_flat, action_size, activation_fn=None)
            self.policy_output = tf.contrib.layers.softmax(self.policy_logits)
            # Action sampling op
            self.action_output = tf.squeeze(tf.multinomial(
                logits=self.policy_logits, num_samples=1),
                                            axis=1)

            # Used for TRFL stuff
            self.value_output_unflat = tf.reshape(self.value_output,
                                                  [n, num_envs])
            self.policy_logits_unflat = tf.reshape(self.policy_logits,
                                                   [n, num_envs, -1])

            self.discounts = tf.placeholder(tf.float32, [n, num_envs],
                                            name="discounts")
            self.initial_Rs = tf.placeholder(tf.float32, [num_envs],
                                             name="initial_Rs")

            #TRFL loss
            a2c_loss, extra = trfl.sequence_advantage_actor_critic_loss(
                policy_logits=self.policy_logits_unflat,
                baseline_values=self.value_output_unflat,
                actions=self.actions,
                rewards=self.rewards,
                pcontinues=self.discounts,
                bootstrap_value=self.initial_Rs,
                entropy_cost=entropy_reg_term,
                normalise_entropy=normalise_entropy)
            self.loss = tf.reduce_mean(a2c_loss)
            self.extra = extra
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(
                self.loss)

            print("Network shapes:")
            print("inputs_: ", self.inputs_.shape)
            print("actions: ", self.actions.shape)
            print("conv1: ", self.conv1.shape)
            print("conv2: ", self.conv2.shape)
            print("conv3: ", self.conv3.shape)
            print("ac_input: ", self.ac_input.shape)
            print("fc1: ", self.fc1.shape)
            print("lstm_hidden_state_input: ",
                  self.lstm_hidden_state_input.shape)
            print("lstm_input: ", self.lstm_input.shape)
            print("lstm_hidden_state_output: ",
                  self.lstm_hidden_state_output.shape)
            print("lstm_output: ", self.lstm_output.shape)
            print("lstm_output_flat: ", self.lstm_output_flat.shape)
            print("value_output: ", self.value_output.shape)
            print("policy_logits: ", self.policy_logits.shape)
            print("policy_output: ", self.policy_output.shape)
            print("value_output_unflat: ", self.value_output_unflat.shape)
            print("policy_logits_unflat: ", self.policy_logits_unflat.shape)

            # Tensorboard
            self.average_reward_metric = tf.placeholder(tf.float32,
                                                        name="average_reward")
            # self.average_length_of_episode = tf.placeholder(tf.float32, name="average_length_of_episode")

            policy_summary = tf.summary.tensor_summary('policy',
                                                       self.policy_output)
            reward_summary = tf.summary.scalar('average_reward_metric',
                                               self.average_reward_metric)
            loss_summary = tf.summary.scalar('loss', self.loss)
            entropy_summary = tf.summary.scalar(
                'policy_entropy', tf.math.reduce_mean(self.extra.entropy))
            baseline_loss_summary = tf.summary.scalar(
                'baseline_loss', tf.math.reduce_mean(self.extra.baseline_loss))
            entropy_loss_summary = tf.summary.scalar(
                'entropy_loss', tf.math.reduce_mean(self.extra.entropy_loss))
            policy_gradient_loss = tf.summary.scalar(
                'policy_gradient_loss',
                tf.math.reduce_mean(self.extra.policy_gradient_loss))

            self.train_step_summary = tf.summary.merge([
                reward_summary, loss_summary, entropy_summary,
                baseline_loss_summary, entropy_loss_summary,
                policy_gradient_loss
            ])

            self.action_step_summary = tf.summary.merge([policy_summary])
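
The merged train_step_summary above is meant to be fetched alongside the training op and written with a tf.summary.FileWriter. A hedged sketch; the writer, the training feed, and the step counter are assumptions:

# feed is the training feed_dict from the previous sketch plus net.average_reward_metric,
# which the reward summary requires.
summary_str, _ = sess.run([net.train_step_summary, net.opt], feed_dict=feed)
writer.add_summary(summary_str, global_step=step)
writer.flush()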