Example #1
class PPO_Hierarchy(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        self.HPs = settings["HPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs
        self.method = self.HPs["Method"]

        #Creating one trajectory buffer per trajectory; the depth depends on whether sub-rewards are stored alongside the hierarchical data.
        if self.subReward:
            self.buffer = [Trajectory(depth=12) for _ in range(nTrajs)]
            #[s0,a,r,r_sub,s1,done]+[HL_actions, HL_log_logits, HL_v, flag, critics, logits]
        else:
            self.buffer = [Trajectory(depth=11) for _ in range(nTrajs)]
            #[s0,a,r,s1,done]+[HL_action, HL_log_logits, HL_v, flag, critics, logits]

        #The scope is documented as an optional kwarg; default to the class name if it is not given.
        scope = kwargs.get("scope", "PPO_Hierarchy")
        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope(scope):
                #Generic placeholders
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.a_his = tf.placeholder(tf.int32, [None], 'A')
                self.td_target_ = tf.placeholder(tf.float32, [None], 'Vtarget')
                self.advantage_ = tf.placeholder(shape=[None],
                                                 dtype=tf.float32,
                                                 name='adv_hold')

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.a_prob = out["metaActor"]
                self.v = out["metaCritic"]
                self.log_logits = out["metaLogLogits"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]
                self.sub_v = out["subCritic"]

                self.nPolicies = len(self.sub_a_prob)

                #Placeholder for the Hierarchical Policy
                self.old_log_logits_ = tf.placeholder(
                    shape=[None, self.nPolicies],
                    dtype=tf.float32,
                    name='old_logit_hold')
                #Placeholder for the Sub-Policies
                self.old_log_logits_sub_ = tf.placeholder(
                    shape=[None, actionSize],
                    dtype=tf.float32,
                    name='old_logit_sub_hold')

                # Creating the Loss and update calls for the Hierarchical policy
                self.hierarchicalLoss = self.CreateLossPPO(
                    self.a_prob, self.td_target_, self.v, self.a_his,
                    self.log_logits, self.old_log_logits_, self.advantage_,
                    self.nPolicies)
                variables = self.Model.getHierarchyVariables()
                self.hierarchyUpdater = self.CreateUpdater(
                    self.hierarchicalLoss, variables)

                # Creating the Losses updaters for the Sub-policies.
                self.subpolicyLoss = []
                self.subpolicyUpdater = []
                for i in range(self.nPolicies):
                    loss = self.CreateLossPPO(self.sub_a_prob[i],
                                              self.td_target_, self.sub_v[i],
                                              self.a_his,
                                              self.sub_log_logits[i],
                                              self.old_log_logits_sub_,
                                              self.advantage_, self.actionSize)
                    self.subpolicyLoss.append(loss)
                    variables = self.Model.getSubpolicyVariables(i)
                    self.subpolicyUpdater.append(
                        self.CreateUpdater(loss, variables))

            #Creating Variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)

    def CreateUpdater(self, loss, variables):
        #Builds an Adam update op for the given variables using the Keras optimizer's graph-mode gradient interface.
        optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
        gradients = optimizer.get_gradients(loss, variables)
        return optimizer.apply_gradients(zip(gradients, variables))

    def CreateLossPPO(self, a_prob, td_target_, v, a_his, log_logits,
                      old_log_logits_, advantage_, actionSize):
        # Entropy
        entropy = -tf.reduce_mean(a_prob * _log(a_prob), name='entropy')

        # Critic Loss
        td_error = td_target_ - v
        critic_loss = tf.reduce_mean(tf.square(td_error), name='critic_loss')

        # Actor Loss
        action_OH = tf.one_hot(a_his, actionSize, dtype=tf.float32)
        log_prob = tf.reduce_sum(log_logits * action_OH, 1)
        old_log_prob = tf.reduce_sum(old_log_logits_ * action_OH, 1)

        # Clipped surrogate function
        ratio = tf.exp(log_prob - old_log_prob)
        surrogate = ratio * advantage_
        clipped_surrogate = tf.clip_by_value(ratio, 1 - self.HPs["eps"],
                                             1 + self.HPs["eps"]) * advantage_
        surrogate_loss = tf.minimum(surrogate,
                                    clipped_surrogate,
                                    name='surrogate_loss')
        actor_loss = -tf.reduce_mean(surrogate_loss, name='actor_loss')

        actor_loss = actor_loss - entropy * self.HPs["EntropyBeta"]
        loss = actor_loss + critic_loss * self.HPs["CriticBeta"]
        return loss
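
    # For reference, CreateLossPPO above implements the standard PPO clipped
    # surrogate objective:
    #   r_t = exp(log pi(a_t|s_t) - log pi_old(a_t|s_t))
    #   L_actor = -E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)]
    # combined with an entropy bonus weighted by EntropyBeta and a squared
    # TD-error critic loss weighted by CriticBeta.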

    def InitiateEpisode(self):
        if self.method == "Greedy":
            pass
        elif self.method == "Fixed Step":
            self.counter = 1
            self.nStep = 4

        elif self.method == "Constant":
            pass

        elif self.method == "Confidence":
            self.pastActions = [None] * self.nTrajs

        elif self.method == "Probabilistic Confidence":
            pass

        else:
            pass

    def GetAction(self, state, step, episode=0):
        """
        Method to run data through hierarchical network

        First run the state through the meta network to select subpolicy to use.
        Second run the state through the proper Subpolicy

        ToDo: Check whether it is faster to run the entire network and select the appropriate subpolicy afterwards, or to run only the required part.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        #Reset the per-episode bookkeeping for the option-selection method at the start of each episode.
        if step == 0:
            self.InitiateEpisode()

        # Run the Meta and Sub-policy Networks
        targets = [self.a_prob, self.log_logits, self.v
                   ] + self.sub_a_prob + self.sub_log_logits + self.sub_v
        res = self.sess.run(targets, {self.s: state})
        LL_probs = res[0]
        HL_log_logits = res[1]
        HL_v = res[2]
        sub_probs = res[3:3 + self.nPolicies]
        sub_log_logits = res[3 + self.nPolicies:3 + 2 * self.nPolicies]
        sub_v = res[3 + 2 * self.nPolicies:]

        if self.method == "Greedy":
            HL_actions = np.array([
                np.random.choice(LL_probs.shape[1], p=prob / sum(prob))
                for prob in LL_probs
            ])
            flag = [True] * state.shape[0]
        elif self.method == "Fixed Step":
            if self.counter == self.nStep:
                #Reseting Step counter and selecting New option
                self.counter = 1
            if self.counter == 1:
                HL_actions = np.array([
                    np.random.choice(LL_probs.shape[1], p=prob / sum(prob))
                    for prob in LL_probs
                ])
                self.traj_action = HL_actions
                flag = [True] * state.shape[0]
            else:
                HL_actions = self.traj_action
                flag = [False] * state.shape[0]
            self.counter += 1

        elif self.method == "Confidence":
            flag = []
            HL_actions = []
            confids = -np.mean(LL_probs * np.log(LL_probs), axis=1)
            for i, confid in enumerate(confids):
                if confid < 0.1 or step == 0:
                    action = np.random.choice(LL_probs.shape[1],
                                              p=LL_probs[i] / sum(LL_probs[i]))
                    HL_actions.append(action)
                    self.pastActions[i] = action
                    flag.append(True)
                else:
                    HL_actions.append(self.pastActions[i])
                    #The previous option is reused, so the flag follows the "Fixed Step" convention.
                    flag.append(False)
            self.traj_action = HL_actions

        elif self.method == "Probabilistic Confidence":
            pass
        else:
            pass

        # Run the Subpolicy Network
        actions = np.array([
            np.random.choice(self.actionSize,
                             p=sub_probs[mod][idx] / sum(sub_probs[mod][idx]))
            for idx, mod in enumerate(HL_actions)
        ])
        critics = [sub_v[mod][idx] for idx, mod in enumerate(HL_actions)]
        logits = [
            sub_log_logits[mod][idx] for idx, mod in enumerate(HL_actions)
        ]

        return actions, [
            HL_actions, HL_log_logits, HL_v, flag, critics, logits
        ]

    def Update(self, HPs):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):

            td_target, advantage, td_target_hier, advantage_hier, actions_hier, ll_hier, s_hier = self.ProcessBuffer(
                HPs, traj)
            # Updating the Hierarchical Controller
            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([
                        s_hier, actions_hier, td_target_hier, advantage_hier,
                        ll_hier
                ], self.HPs["MinibatchSize"]):

                    feed_dict = {
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.a_his: np.asarray(batch[1]).squeeze(),
                        self.td_target_: np.asarray(batch[2]).squeeze(),
                        self.advantage_: np.reshape(batch[3], [-1]),
                        self.old_log_logits_: np.asarray(batch[4]).squeeze()
                    }
                    self.sess.run(self.hierarchyUpdater, feed_dict)

            if self.UpdateSubpolicies:
                #Collecting the data into different sub-Policies
                if self.subReward:
                    tmp, l1, l2, l3, l4, l5 = (
                        list(t) for t in zip(
                            *sorted(zip(self.buffer[traj][6], self.buffer[traj]
                                        [0], self.buffer[traj][1], td_target,
                                        advantage, self.buffer[traj][11]),
                                    key=lambda x: x[0]))
                    )  #Sorting by the hierarchical action; index 11 holds the sub-policy log logits when sub-rewards are used.
                    #dividing at the splits
                    for subpolicyNum, data in SubpolicyIterator(
                            tmp, [l1, l2, l3, l4, l5]):
                        #Updating each of the sub-policies.
                        for epoch in range(self.HPs["Epochs"]):
                            for batch in MultiBatchDivider(
                                    data, self.HPs["MinibatchSize"]):

                                feed_dict = {
                                    self.s:
                                    np.asarray(batch[0]).squeeze(),
                                    self.a_his:
                                    np.asarray(batch[1]).squeeze(),
                                    self.td_target_:
                                    np.asarray(batch[2]).squeeze(),
                                    self.advantage_:
                                    np.reshape(batch[3], [-1]),
                                    self.old_log_logits_sub_:
                                    np.asarray(batch[4]).squeeze()
                                }
                                self.sess.run(
                                    self.subpolicyUpdater[subpolicyNum],
                                    feed_dict)
                    self.SubpolicyDistribution.extend(
                        np.asarray(self.buffer[traj][6]))
                else:
                    tmp, l1, l2, l3, l4, l5 = (
                        list(t) for t in zip(
                            *sorted(zip(self.buffer[traj][5], self.buffer[traj]
                                        [0], self.buffer[traj][1], td_target,
                                        advantage, self.buffer[traj][10]),
                                    key=lambda x: x[0]))
                    )  #Sorting by the value in the actions_hier
                    #dividing at the splits
                    for subpolicyNum, data in SubpolicyIterator(
                            tmp, [l1, l2, l3, l4, l5]):
                        #Updating each of the sub-policies.
                        for epoch in range(self.HPs["Epochs"]):
                            for batch in MultiBatchDivider(
                                    data, self.HPs["MinibatchSize"]):

                                feed_dict = {
                                    self.s:
                                    np.asarray(batch[0]).squeeze(),
                                    self.a_his:
                                    np.asarray(batch[1]).squeeze(),
                                    self.td_target_:
                                    np.asarray(batch[2]).squeeze(),
                                    self.advantage_:
                                    np.reshape(batch[3], [-1]),
                                    self.old_log_logits_sub_:
                                    np.asarray(batch[4]).squeeze()
                                }
                                self.sess.run(
                                    self.subpolicyUpdater[subpolicyNum],
                                    feed_dict)
                    self.SubpolicyDistribution.extend(
                        np.asarray(self.buffer[traj][5]))

            self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i in range(self.nPolicies):
            length = len(self.SubpolicyDistribution.tolist())
            if length == 0:
                length = 1
            stats[
                "Subpolicy Use/" +
                str(i)] = self.SubpolicyDistribution.tolist().count(i) / length
        return stats

    def ProcessBuffer(self, HPs, traj):
        """
        Processes a trajectory buffer into training targets for both levels of the hierarchy.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.
        traj : int
            Index of the trajectory buffer to process.

        Returns
        -------
        td_target : list
            Temporal Difference targets for the sub-policy updates.
        advantage : list
            Advantages for the sub-policy actions.
        td_target_hier : list
            Temporal Difference targets for the hierarchical controller.
        advantage_hier : list
            Advantages for the hierarchical controller.
        actions : list
            Hierarchical action taken at each decision point.
        ll : list
            Hierarchical log logits at each decision point.
        s : list
            State at each hierarchical decision point.
        """
        #Splitting the buffer into different episodes based on the done tag.
        #The done flag sits at index 5 when sub-rewards are stored, otherwise at index 4.
        done_idx = 5 if self.subReward else 4
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][done_idx]) if x]
        if self.subReward:
            #Data needed for the Low Level Controllers
            reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
            sub_reward_lists = np.split(self.buffer[traj][3], split_loc[:-1])
            value_lists = np.split(self.buffer[traj][10], split_loc[:-1])

            #Data needed for the Hierarchical Controller
            HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
            HL_Critic_lists = np.split(self.buffer[traj][8], split_loc[:-1])
            HL_Logits_lists = np.split(self.buffer[traj][7], split_loc[:-1])
            HL_action_lists = np.split(self.buffer[traj][6], split_loc[:-1])
            HL_flag_lists = np.split(self.buffer[traj][9], split_loc[:-1])

            td_target = []
            advantage = []
            td_target_hier = []
            advantage_hier = []
            ll = []
            actions = []
            s = []

            for rew, s_rew, value, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(
                    reward_lists, sub_reward_lists, value_lists,
                    HL_Critic_lists, HL_Logits_lists, HL_action_lists,
                    HL_flag_lists, HL_S_lists):
                # Calculating the per step advantage of each of the different sections
                td_target_i, advantage_i = gae(
                    s_rew.reshape(-1).tolist(),
                    value.reshape(-1).tolist(), 0, self.HPs["Gamma"],
                    self.HPs["lambda"])
                td_target.extend(td_target_i)
                advantage.extend(advantage_i)
                #Collapsing different trajectory lengths for the hierarchical controller
                split_loc_ = [i + 1 for i, x in enumerate(HL_flag[:-1]) if x]
                rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
                value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
                actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
                ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
                s.extend([l[0] for l in np.split(HL_s, split_loc_)])
                #Calculating the td_target and advantage for the hierarchical controller.
                td_target_i_, advantage_i_ = gae(
                    np.asarray(rew_hier).reshape(-1).tolist(),
                    np.asarray(value_hier).reshape(-1).tolist(), 0,
                    self.HPs["Gamma"], self.HPs["lambda"])
                td_target_hier.extend(td_target_i_)
                advantage_hier.extend(advantage_i_)

            return td_target, advantage, td_target_hier, advantage_hier, actions, ll, s
        else:

            #Data needed for the Low Level Controllers
            reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
            value_lists = np.split(self.buffer[traj][9], split_loc[:-1])

            #Data needed for the Hierarchical Controller
            HL_S_lists = np.split(self.buffer[traj][0], split_loc[:-1])
            HL_Critic_lists = np.split(self.buffer[traj][7], split_loc[:-1])
            HL_Logits_lists = np.split(self.buffer[traj][6], split_loc[:-1])
            HL_action_lists = np.split(self.buffer[traj][5], split_loc[:-1])
            HL_flag_lists = np.split(self.buffer[traj][8], split_loc[:-1])

            td_target = []
            advantage = []
            td_target_hier = []
            advantage_hier = []
            ll = []
            actions = []
            s = []

            for rew, value, HL_critic, HL_ll, HL_a, HL_flag, HL_s in zip(
                    reward_lists, value_lists, HL_Critic_lists,
                    HL_Logits_lists, HL_action_lists, HL_flag_lists,
                    HL_S_lists):
                # Calculating the per step advantage of each of the different sections
                td_target_i, advantage_i = gae(
                    rew.reshape(-1).tolist(),
                    value.reshape(-1).tolist(), 0, self.HPs["Gamma"],
                    self.HPs["lambda"])
                td_target.extend(td_target_i)
                advantage.extend(advantage_i)
                #Collapsing different trajectory lengths for the hierarchical controller
                split_loc_ = [i + 1 for i, x in enumerate(HL_flag[:-1]) if x]
                rew_hier = [np.sum(l) for l in np.split(rew, split_loc_)]
                value_hier = [l[0] for l in np.split(HL_critic, split_loc_)]
                actions.extend([l[0] for l in np.split(HL_a, split_loc_)])
                ll.extend([l[0] for l in np.split(HL_ll, split_loc_)])
                s.extend([l[0] for l in np.split(HL_s, split_loc_)])
                #Calculating the td_target and advantage for the hierarchical controller.
                td_target_i_, advantage_i_ = gae(
                    np.asarray(rew_hier).reshape(-1).tolist(),
                    np.asarray(value_hier).reshape(-1).tolist(), 0,
                    self.HPs["Gamma"], self.HPs["lambda"])
                td_target_hier.extend(td_target_i_)
                advantage_hier.extend(advantage_i_)

            return td_target, advantage, td_target_hier, advantage_hier, actions, ll, s

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
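
# A minimal collection-loop sketch (not part of the original class) showing how the
# 11-entry trajectory layout documented above is assumed to be filled before
# Update() is called. The env object and the Trajectory append interface are
# hypothetical placeholders.
#
#   method = PPO_Hierarchy(sess, settings, netConfigOverride={},
#                          stateShape=[39, 39, 6], actionSize=5)
#   s0 = env.reset()
#   for step in range(maxSteps):
#       actions, (HL_a, HL_ll, HL_v, flag, critics, logits) = method.GetAction(s0, step)
#       s1, r, done, _ = env.step(actions)
#       #Order must match: [s0,a,r,s1,done]+[HL_action, HL_log_logits, HL_v, flag, critics, logits]
#       method.buffer[0].append([s0, actions, r, s1, done,
#                                HL_a, HL_ll, HL_v, flag, critics, logits])
#       method.Update(settings["HPs"])
#       s0 = s1
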
Example #2
class OptionCritic(Method):
    def __init__(self,
                 sess,
                 settings,
                 netConfigOverride,
                 stateShape,
                 actionSize,
                 nTrajs=1,
                 **kwargs):
        """
        Initializes a training method for a neural network.

        Parameters
        ----------
        Model : Keras Model Object
            A Keras model object with fully defined layers and a call function. See examples in networks module.
        sess : Tensorflow Session
            Initialized Tensorflow session
        stateShape : list
            List of integers of the inputs shape size. Ex [39,39,6]
        actionSize : int
            Output size of the network.
        HPs : dict
            Dictionary that contains all hyperparameters to be used in the methods training
        nTrajs : int (Optional)
            Number that specifies the number of trajectories to be created for collecting training data.
        scope : str (Optional)
            Name of the PPO method. Used to group and differentiate variables between other networks.

        Returns
        -------
        N/A
        """
        #Processing inputs
        self.actionSize = actionSize
        self.sess = sess
        self.Model = NetworkBuilder(networkConfig=settings["NetworkConfig"],
                                    netConfigOverride=netConfigOverride,
                                    actionSize=actionSize)
        self.method = "Confidence"  #Create input for this.
        self.HPs = settings["NetworkHPs"]
        self.subReward = False
        self.UpdateSubpolicies = True
        self.nTrajs = nTrajs

        #Creating buffer
        self.buffer = [Trajectory(depth=7) for _ in range(nTrajs)]
        #[s0,a,r,s1,done]+[HL_action, q]

        with self.sess.as_default(), self.sess.graph.as_default():
            with tf.name_scope("OptionCritic"):
                #Generic placeholders
                self.batch_size = tf.placeholder(tf.int32, 1, 'BS')
                self.s = tf.placeholder(tf.float32, [None] + stateShape, 'S')
                self.actions = tf.placeholder(tf.int32, [None], 'A')
                self.rewards = tf.placeholder(tf.float32, [None], 'R')
                # self.advantage_ = tf.placeholder(shape=[None], dtype=tf.float32, name='adv_hold')
                self.options = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="options")

                batch_indexer = tf.range(tf.reshape(self.batch_size, []))

                #Initializing Network I/O
                inputs = {"state": self.s}
                out = self.Model(inputs)
                self.term = out["metaTermination"]
                self.q = out["metaCritic"]

                self.sub_a_prob = out["subActor"]
                self.sub_log_logits = out["subLogLogits"]

                self.nPolicies = len(self.sub_a_prob)

                # Creating the Loss and update calls for the Hierarchical policy
                # Indexers
                self.responsible_options = tf.stack(
                    [batch_indexer, self.options], axis=1)
                self.responsible_actions = tf.stack(
                    [batch_indexer, self.actions], axis=1)
                self.network_indexer = tf.stack([self.options, batch_indexer],
                                                axis=1)

                # Q Values OVER options
                self.disconnected_q_vals = tf.stop_gradient(self.q)

                # Q values of each option that was taken
                self.responsible_opt_q_vals = tf.gather_nd(
                    params=self.q, indices=self.responsible_options
                )  # Extract q values for each option
                self.disconnected_q_vals_option = tf.gather_nd(
                    params=self.disconnected_q_vals,
                    indices=self.responsible_options)

                # Termination probability of each option that was taken
                self.terminations = tf.gather_nd(
                    params=self.term, indices=self.responsible_options)

                # Q values for each action that was taken
                relevant_networks = tf.gather_nd(params=self.sub_a_prob,
                                                 indices=self.network_indexer)
                relevant_networks = tf.nn.softmax(relevant_networks, axis=1)

                self.action_values = tf.gather_nd(
                    params=relevant_networks, indices=self.responsible_actions)

                # Weighted average value
                option_eps = 0.001
                self.value = tf.reduce_max(self.q) * (1 - option_eps) + (
                    option_eps * tf.reduce_mean(self.q))
                disconnected_value = tf.stop_gradient(self.value)

                # Losses; TODO: Why reduce sum vs reduce mean?
                vf_coef = 0.5
                self.value_loss = vf_coef * tf.reduce_mean(
                    0.5 *
                    tf.square(self.rewards - self.responsible_opt_q_vals))
                self.policy_loss = tf.reduce_mean(
                    _log(self.action_values) *
                    (self.rewards - self.disconnected_q_vals_option))
                self.deliberation_costs = 0.020
                self.termination_loss = tf.reduce_mean(
                    self.terminations *
                    ((self.disconnected_q_vals_option - disconnected_value) +
                     self.deliberation_costs))
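
                # For reference, the three losses above roughly correspond to the
                # option-critic architecture (Bacon et al., 2017), with the return
                # estimate fed in through self.rewards during Update():
                #   value loss:       squared error between the return and Q(s, w)
                #   policy loss:      log pi_w(a|s) * (return - Q(s, w))   (intra-option policy gradient)
                #   termination loss: beta_w(s) * (Q(s, w) - V(s) + deliberation cost)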

                ent_coef = 0.01
                action_probabilities = self.sub_a_prob
                self.entropy = ent_coef * tf.reduce_mean(
                    action_probabilities * _log(action_probabilities))

                self.loss = -self.policy_loss - self.entropy - self.value_loss - self.termination_loss

                #Collect the trainable variables under the OptionCritic scope.
                variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              "OptionCritic")
                optimizer = tf.keras.optimizers.Adam(self.HPs["LR"])
                gradients = optimizer.get_gradients(self.loss, variables)
                self.update_op = optimizer.apply_gradients(
                    zip(gradients, variables))

            #Creating Variables for the purpose of logging.
            self.SubpolicyDistribution = MovingAverage(1000)

    def GetAction(self, state, step, episode=0):
        """
        Method to run data through hierarchical network

        First run the state through the meta network to select subpolicy to use.
        Second run the state through the proper Subpolicy

        ToDo: Check whether it is faster to run the entire network and select the appropriate subpolicy afterwards, or to run only the required part.

        Parameters
        ----------
        state : np.array
            Data with the shape of [N, self.stateShape] where N is the number of samples

        Returns
        -------
        actions : list[int]
            List of actions based on NN output.
        extraData : list
            List of data that is passed to the execution code to be bundled with state data.
        """
        #Reset the stored option for each trajectory at the start of an episode.
        if step == 0:
            self.pastActions = [None] * self.nTrajs

        # Run the Meta and Sub-policy Networks
        targets = [self.q, self.term] + self.sub_a_prob + self.sub_log_logits
        res = self.sess.run(targets, {self.s: state})
        q = res[0]
        terminations = res[1]
        sub_probs = res[2:3 + self.nPolicies]
        sub_log_logits = res[2 + self.nPolicies:2 + 2 * self.nPolicies]
        HL_actions = []
        for i, term in enumerate(terminations):
            if step == 0:
                action = np.argmax(q[i])
                HL_actions.append(action)
                self.pastActions[i] = action
            elif random.uniform(0, 1) < term[self.pastActions[i]]:
                # action = np.argmax(q[i])
                action = random.randint(0, self.nPolicies - 1)
                HL_actions.append(action)
                self.pastActions[i] = action
            else:
                action = random.randint(0, self.nPolicies - 1)
                HL_actions.append(action)
                # HL_actions.append(self.pastActions[i])
        self.traj_action = HL_actions

        # Run the Subpolicy Network
        actions = np.array([
            np.random.choice(self.actionSize,
                             p=sub_probs[mod][idx] / sum(sub_probs[mod][idx]))
            for idx, mod in enumerate(HL_actions)
        ])
        logits = [
            sub_log_logits[mod][idx] for idx, mod in enumerate(HL_actions)
        ]

        return actions, [HL_actions, q]

    def Update(self, HPs):
        """
        Processes the buffer and backpropagates the losses through the NN.

        Parameters
        ----------
        HPs : dict
            Hyperparameters for training.

        Returns
        -------
        N/A
        """
        samples = 0
        for i in range(len(self.buffer)):
            samples += len(self.buffer[i])
        if samples < self.HPs["BatchSize"]:
            return

        for traj in range(len(self.buffer)):
            advantage = self.ProcessBuffer(traj)
            # Updating the Hierarchical Controller
            for epoch in range(self.HPs["Epochs"]):
                for batch in MultiBatchDivider([
                        self.buffer[traj][0], self.buffer[traj][1], advantage,
                        self.buffer[traj][5]
                ], self.HPs["MinibatchSize"]):

                    feed_dict = {
                        self.batch_size:
                        [np.asarray(batch[0]).squeeze().shape[0]],
                        self.s: np.asarray(batch[0]).squeeze(),
                        self.actions: np.asarray(batch[1]).squeeze(),
                        self.rewards: np.asarray(batch[2]).squeeze(),
                        self.options: np.reshape(batch[3], [-1])
                    }
                    self.sess.run(self.update_op, feed_dict)
            self.SubpolicyDistribution.extend(np.asarray(self.buffer[traj][5]))
            self.ClearTrajectory()

    def GetStatistics(self):
        stats = {}
        for i in range(self.nPolicies):
            length = len(self.SubpolicyDistribution.tolist())
            if length == 0:
                length = 1
            stats[
                "Subpolicy Use/" +
                str(i)] = self.SubpolicyDistribution.tolist().count(i) / length
        return stats

    def ProcessBuffer(self, traj):
        """
        Processes a trajectory buffer into the advantages used to update the network.

        Parameters
        ----------
        traj : int
            Index of the trajectory buffer to process.

        Returns
        -------
        advantage : list
            List of advantages for particular actions.
        """
        #Splitting the buffer into different episodes based on the done tag.
        split_loc = [i + 1 for i, x in enumerate(self.buffer[traj][4]) if x]
        #Data needed for the Low Level Controllers
        reward_lists = np.split(self.buffer[traj][2], split_loc[:-1])
        value_lists = np.split(self.buffer[traj][6], split_loc[:-1])

        HL_action_lists = np.split(self.buffer[traj][5], split_loc[:-1])

        td_target = []
        advantage = []

        for rew, value, options in zip(reward_lists, value_lists,
                                       HL_action_lists):
            # Calculating the per step advantage of each of the different sections
            val = []
            for i, option in enumerate(options):
                val.append(value[i, 0, option])
            td_target_i, advantage_i = gae(
                rew.reshape(-1).tolist(),
                np.asarray(val).reshape(-1).tolist(), 0, self.HPs["Gamma"],
                self.HPs["lambda"])
            td_target.extend(td_target_i)
            advantage.extend(advantage_i)

        return advantage

    @property
    def getVars(self):
        return self.Model.getVars("PPO_Training")
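
# A minimal sketch of the gae() helper both methods above call, assuming the
# signature gae(rewards, values, bootstrap_value, gamma, lam) -> (td_target, advantage)
# implied by the call sites. This is not the project's implementation, only a
# reference for standard Generalized Advantage Estimation.
def gae_sketch(rewards, values, bootstrap_value, gamma, lam):
    #Append the bootstrap value so the final TD error can be computed.
    values_ext = list(values) + [bootstrap_value]
    advantage = [0.0] * len(rewards)
    last_gae = 0.0
    #Accumulate discounted TD errors backwards through the trajectory.
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values_ext[t + 1] - values_ext[t]
        last_gae = delta + gamma * lam * last_gae
        advantage[t] = last_gae
    #The TD target is the advantage added back onto the value baseline.
    td_target = [a + v for a, v in zip(advantage, values)]
    return td_target, advantage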