Example #1
    def __init__(self,
                 learning_rates=[0.05, 0.00025],
                 state_sizes=[0, 0],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None,
                 load=True):
        """Initializes a hierarchical DQN agent.

           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                         State sizes are assumed to be 1-dimensional.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if agent has satisfied a particular subgoal.
            load: whether to load saved meta-controller and controller weights from
                  weights/meta/model and weights/control/model.
        """

        subgoals = np.array(subgoals)

        self.meta_path = "weights/meta/model"
        self.control_path = "weights/control/model"

        meta, control = None, None

        if load:
            meta = self.meta_path
            control = self.control_path

        self._meta_controller = DqnAgent(state_dims=state_sizes,
                                         num_actions=subgoals.shape[0],
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01,
                                         file_path=meta)

        self._controller = DqnAgent(
            learning_rate=learning_rates[1],
            num_actions=num_primitive_actions,
            state_dims=[state_sizes[0] + subgoals.shape[1]],
            epsilon_end=0.01,
            file_path=control)

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intrinsic_time_step = 0
        self._episode = 0
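
A rough usage sketch for the constructor above (not from the repository): the subgoal array, sizes, and check_subgoal are placeholders, the enclosing class is assumed to be the HierarchicalDqnAgent shown in later examples, and the exact state_sizes convention ultimately depends on the DqnAgent class, which is not shown here.

import numpy as np

# Hypothetical setup: 4 one-hot subgoals over a 16-dimensional environment state.
subgoals = [np.eye(16)[i] for i in (3, 7, 11, 15)]

def check_subgoal(state, subgoal_index):
    # Placeholder critic: the subgoal is satisfied when the matching state bit is on.
    return state[(3, 7, 11, 15)[subgoal_index]] == 1

agent = HierarchicalDqnAgent(learning_rates=[0.05, 0.00025],
                             state_sizes=[16, 16],   # illustrative sizes only
                             subgoals=subgoals,
                             num_subgoals=4,
                             num_primitive_actions=4,
                             check_subgoal_fn=check_subgoal,
                             load=False)  # skip restoring weights/meta/model and weights/control/model
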
Example #2
    def __init__(self, 
        num_agents,
        state_size,
        num_actions,
        num_communication_turns):
        """Initializes a multi-agent RL agent. Used as a baseline in the FCRL experiments.
        Args:
            num_agents: Number of agents / controllers.
            state_size: The base size of each agent's state.
            num_actions: The number of actions. 
            num_communication_turns: Number of turns for which the agents communicate.
        """

        self._num_agents = num_agents
        self._num_actions = num_actions
        self._num_communication_turns = num_communication_turns
        self._base_state_size = state_size
        self._state_size = state_size + self._num_communication_turns * self._num_actions

        # Initialize the agents.
        self._agents = []
        for i in xrange(self._num_agents):
            self._agents.append(
                DqnAgent(state_dims=self._state_size, 
                    num_actions=self._num_actions,
                    learning_rate=0.1,
                    epsilon_end=0.01))

        self._communication_states = None
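
A minimal sketch of how this baseline might be constructed; the class name MultiAgentBaseline is assumed here since the snippet omits it, and the sizes are placeholders. With state_size=10, num_actions=4, and num_communication_turns=3, each agent's DQN input grows to 10 + 3 * 4 = 22 features.

# Hypothetical instantiation of the communication baseline above.
baseline = MultiAgentBaseline(num_agents=2,
                              state_size=10,
                              num_actions=4,
                              num_communication_turns=3)
assert baseline._state_size == 10 + 3 * 4  # base state plus one action-sized vector per turn
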
Example #3
    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None):
        """Initializes a hierarchical DQN agent.

           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                         State sizes are assumed to be 1-dimensional.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if agent has satisfied a particular subgoal.
        """

        self._meta_controller = DqnAgent(state_dims=state_sizes[0],
                                         num_actions=num_subgoals,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)

        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[state_sizes[1] + num_subgoals],
                                    epsilon_end=0.01)

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intrinsic_time_step = 0
        self._episode = 0
        self._original_state = None
Example #4
    def create_controllers(self, tsc):
        if tsc == 'uniform':
            for id in self.tl_ids:
                c = UniformController(self.conn, id, self.netdata, self.mode, self.red_t, self.yellow_t, self.green_t)
                self.Controllers[id] = c
        elif tsc == 'dqn':
            for id in self.tl_ids:
                # State: density and queue for each incoming lane, plus the green phases, +1 for the all-red clearance phase.
                state_size = len(self.netdata['inter'][id]['incoming_lanes']) * 2 + len(self.netdata['inter'][id]['green_phases']) + 1
                action_size = len(self.netdata['inter'][id]['green_phases'])
                print(state_size, action_size)
                rlagent = DqnAgent(self.args.batch_size, state_size, action_size)
                c = DqnController(self.conn, id, self.netdata, self.mode, rlagent, self.green_t, self.yellow_t, self.red_t)
                self.Controllers[id] = c
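
To make the state-size arithmetic above concrete, here is an illustrative netdata entry (the lane and phase names are made up): with 8 incoming lanes and 4 green phases, the DQN sees 8 * 2 + 4 + 1 = 21 state entries and has 4 actions.

# Illustrative only: a fake netdata entry for a single traffic light.
netdata = {'inter': {'tl_0': {'incoming_lanes': ['n1', 'n2', 's1', 's2', 'e1', 'e2', 'w1', 'w2'],
                              'green_phases': ['NS', 'NSL', 'EW', 'EWL']}}}

lanes = netdata['inter']['tl_0']['incoming_lanes']
phases = netdata['inter']['tl_0']['green_phases']

# Density and queue per incoming lane, one slot per green phase, +1 for the all-red clearance phase.
state_size = len(lanes) * 2 + len(phases) + 1   # 8 * 2 + 4 + 1 = 21
action_size = len(phases)                       # 4
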
Example #5
    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 constraints=None,
                 num_constraints=0,
                 num_primitive_actions=0,
                 num_controllers=0,
                 num_controllers_per_subtask=0,
                 num_communication_turns=0,
                 critic_fn=None,
                 controller_subset_fn=None):
        """Initializes a FCRL agent.
           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                         State sizes are assumed to be 1-dimensional.
            constraints: array of constraints for the meta-controller, which defines its action space.
            num_constraints: number of actions for the meta-controller.
            num_primitive_actions: number of actions for the controller.
            num_controllers: total number of controllers. 
            num_controllers_per_subtask: the number of controllers that coordinate to complete a given subtask.
            num_communication_turns: the number of turns for which controllers communicate.
            critic_fn: a custom critic function for a particular environment.
            controller_subset_fn: a custom function that returns the next controller subset.
        """
        self._meta_controller_state_size = state_sizes[0]

        self._num_controllers = num_controllers
        # Number of controllers that communicate to complete a subtask.
        self._num_controllers_per_subtask = num_controllers_per_subtask

        # A controller's state size is the input state size (the environment state)
        # + the ordering vector size (num_controllers_per_subtask)
        # + the communication vectors from the communication rounds and output round
        # (num_communication_turns * num_primitive_actions).
        self._controller_state_size = state_sizes[1]
        self._controller_state_size += self._num_controllers_per_subtask
        self._controller_state_size += num_communication_turns * num_primitive_actions

        self._meta_controller = DqnAgent(state_dims=state_sizes[0],
                                         num_actions=num_constraints,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)

        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[self._controller_state_size],
                                    epsilon_end=0.01)

        self._constraints = constraints
        self._num_constraints = num_constraints
        self._num_primitive_actions = num_primitive_actions
        self._num_communication_turns = num_communication_turns
        self._critic_fn = critic_fn
        self._controller_subset_fn = controller_subset_fn

        self._intrinsic_time_step = 0
        self._episode = 0

        # Book-keeping variables.
        # Keeps track of the current meta-controller state.
        self._meta_controller_state = None
        # Keeps track of the current action selected by the meta-controller.
        self._curr_constraint = None
        # Keeps track of the meta-controller's reward for the current meta-controller time step.
        self._meta_controller_reward = 0

        # Keeps track of the constraints tried for current controller subset.
        self._tried_constraints = self.reset_tried_constraints()
        # Keeps track of controllers who have completed coordination in the current episode.
        self._done_controllers = []
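
As a worked example of the controller state layout described in the comments above (numbers are illustrative, not from the experiments): with a per-controller environment state of size 6, 2 controllers per subtask, 3 communication turns, and 5 primitive actions, each controller's DQN input has 6 + 2 + 3 * 5 = 23 entries.

# Illustrative controller state-size arithmetic for the FCRL constructor above.
env_state_size = 6            # state_sizes[1]
per_subtask = 2               # num_controllers_per_subtask (ordering vector length)
comm_turns = 3                # num_communication_turns
primitive_actions = 5         # num_primitive_actions

controller_state_size = env_state_size + per_subtask + comm_turns * primitive_actions
assert controller_state_size == 23
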
Example #6
class FederatedControlAgent(object):
    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 constraints=None,
                 num_constraints=0,
                 num_primitive_actions=0,
                 num_controllers=0,
                 num_controllers_per_subtask=0,
                 num_communication_turns=0,
                 critic_fn=None,
                 controller_subset_fn=None):
        """Initializes a FCRL agent.
           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                         State sizes are assumed to be 1-dimensional.
            constraints: array of constraints for the meta-controller, which defines its action space.
            num_constraints: number of actions for the meta-controller.
            num_primitive_actions: number of actions for the controller.
            num_controllers: total number of controllers. 
            num_controllers_per_subtask: the number of controllers that coordinate to complete a given subtask.
            num_communication_turns: the number of turns for which controllers communicate.
            critic_fn: a custom critic function for a particular environment.
            controller_subset_fn: a custom function that returns the next controller subset.
        """
        self._meta_controller_state_size = state_sizes[0]

        self._num_controllers = num_controllers
        # Number of controllers that communicate to complete a subtask.
        self._num_controllers_per_subtask = num_controllers_per_subtask

        # A controller's state size is the input state size (the environment state)
        # + the ordering vector size (num_controllers_per_subtask)
        # + the communication vectors from the communication rounds and output round
        # (num_communication_turns * num_primitive_actions).
        self._controller_state_size = state_sizes[1]
        self._controller_state_size += self._num_controllers_per_subtask
        self._controller_state_size += num_communication_turns * num_primitive_actions

        self._meta_controller = DqnAgent(state_dims=state_sizes[0],
                                         num_actions=num_constraints,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)

        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[self._controller_state_size],
                                    epsilon_end=0.01)

        self._constraints = constraints
        self._num_constraints = num_constraints
        self._num_primitive_actions = num_primitive_actions
        self._num_communication_turns = num_communication_turns
        self._critic_fn = critic_fn
        self._controller_subset_fn = controller_subset_fn

        self._intrinsic_time_step = 0
        self._episode = 0

        # Book-keeping variables.
        # Keeps track of the current meta-controller state.
        self._meta_controller_state = None
        # History of meta-controller states (appended to in sample()).
        self._meta_controller_states = []
        # Primitive actions selected so far (used by get_meta_controller_state()).
        self._selected_primitive_actions = []
        # Keeps track of the current action selected by the meta-controller.
        self._curr_constraint = None
        # Keeps track of the meta-controller's reward for the current meta-controller time step.
        self._meta_controller_reward = 0

        # Keeps track of the constraints tried for current controller subset.
        self._tried_constraints = self.reset_tried_constraints()
        # Keeps track of controllers who have completed coordination in the current episode.
        self._done_controllers = []

    def reset_tried_constraints(self):
        return np.zeros(self._num_constraints)

    def get_meta_controller_state(self):
        """
        Returns the meta-controller state.
        Concatenates a vector representation of the largest selected primitive action
        with the tried constraints vector.
        """
        state = np.zeros(self._num_primitive_actions)

        if len(self._selected_primitive_actions):
            selected_primitive_actions = np.array(
                self._selected_primitive_actions)
            max_primitive_action = np.max(selected_primitive_actions)
            state[max_primitive_action] = 1
        state = np.concatenate((state, np.copy(self._tried_constraints)),
                               axis=0)

        return state

    def get_controller_environment_states(self, env_state):
        """Returns an array of controller environment states."""
        controller_environment_states = np.split(env_state,
                                                 self._num_controllers)
        return controller_environment_states

    def get_controller_state(self,
                             env_state,
                             constraint,
                             ordering,
                             comm_turn,
                             communication_vector=None):
        """
        Returns the controller state containing the controller's environment state, 
        constraint, ordering vector, and received communication vectors.

        Args:
            env_state: The environment state for the current controller.
            constraint: The constraint provided to the current controller.
            ordering: The current controller's position vector in the overall ordering.
            comm_turn: index of the current communication turn.
            communication_vector: communication received from other controllers in the current communication turn.
        """
        controller_state = np.zeros(self._controller_state_size)

        # Apply the constraint to the environment state.
        env_state_plus_constraint = np.logical_and(env_state,
                                                   constraint).astype(int)
        env_state_size = np.size(env_state_plus_constraint)

        controller_state[0:env_state_size] = env_state_plus_constraint
        controller_state[env_state_size:env_state_size +
                         self._num_controllers_per_subtask] = ordering

        if comm_turn >= 1:
            controller_state[(
                env_state_size + self._num_controllers_per_subtask +
                (comm_turn - 1) * self._num_primitive_actions):(
                    env_state_size + self._num_controllers_per_subtask +
                    comm_turn * self._num_primitive_actions)] = communication_vector

        return np.copy(controller_state)

    def intrinsic_reward(self, env_states, constraints, orderings,
                         selected_actions):
        """Intrinsically rewards a subset of controllers using the provided critic function."""
        return self._critic_fn(env_states, constraints, orderings,
                               selected_actions)

    def construct_orderings(self):
        orderings = []
        for i in xrange(self._num_controllers_per_subtask):
            ordering = np.zeros(self._num_controllers_per_subtask)
            ordering[i] = 1
            orderings.append(ordering)
        return orderings

    def controller_bookkeeping_vars(self):
        """
        Returns initializations for controller states, actions, communications, and outputs.
        """
        # Keeps track of all the controller states.
        controller_states = np.zeros((self._num_communication_turns + 1,
                                      self._num_controllers,
                                      self._controller_state_size))
        # Keeps track of all controllers' selected actions (communication + output),
        # one slot per communication turn plus the final output turn.
        controller_actions = np.zeros((self._num_communication_turns + 1,
                                       self._num_controllers, 1))
        # List that will contain the output actions.
        output_actions = []

        return controller_states, controller_actions, output_actions

    def sample(self, environment_state, controller_ordering, eval=False):
        """Samples a (possibly incomplete) output set of controller actions.
        
        Args:
         environment_state: The state provided by the environment.
         controller_ordering: the ordering of controllers specified by the environment.
         eval: Whether this is a train / test episode.

        """
        meta_controller_state = self.get_meta_controller_state()
        self._meta_controller_states.append(meta_controller_state)

        # Sample a constraint from the meta-controller.
        if not eval:
            constraint = self._meta_controller.sample(meta_controller_state)
        else:
            constraint = self._meta_controller.best_action(
                meta_controller_state)

        self._tried_constraints[constraint] = 1
        self._curr_constraint = constraint

        controller_environment_states = self.get_controller_environment_states(
            environment_state)

        controller_subset = self._controller_subset_fn(controller_ordering,
                                                       self._done_controllers)

        orderings = self.construct_orderings()

        controller_states, controller_actions, output_actions = self.controller_bookkeeping_vars(
        )

        # Note: Currently only works when the subsets contain only 2 controllers due to the way
        # in which communication vectors are appended to the controller states.
        previous_turn_communication_vectors = [
            None, None
        ]  # The latest communication vectors.
        for comm_turn in xrange(self._num_communication_turns + 1):

            communication_vectors = np.zeros((self._num_controllers_per_subtask,
                                              self._num_primitive_actions))

            for i in xrange(np.size(controller_subset)):
                ordering = orderings[i]

                # Construct the controller state.
                controller_index = controller_subset[i]
                env_state = controller_environment_states[controller_index]
                prev_comm_vector = previous_turn_communication_vectors[
                    (i + 1) % self._num_controllers_per_subtask]
                controller_state = self.get_controller_state(
                    env_state, constraint, ordering, comm_turn,
                    prev_comm_vector)

                controller_states[comm_turn][i] = controller_state

                if not eval:
                    action = self._controller.sample(controller_state)
                else:
                    action = self._controller.best_action(controller_state)

                controller_actions[comm_turn][i] = action

                communication_vector = np.zeros(self._num_primitive_actions)
                communication_vector[action] = 1
                communication_vectors[i] = communication_vector
                previous_turn_communication_vectors[i] = communication_vector

                if comm_turn == self._num_communication_turns - 1:
                    output_actions.append(action)

        # Compute the intrinsic reward that all the controllers in the controller
        # subset receive.
        self._intrinsic_reward = self._critic_fn(controller_environment_states,
                                                 constraint, orderings,
                                                 output_actions)

        # Store the controller transitions.
        for comm_turn in xrange(self._num_communication_turns):
            for i in xrange(np.size(controller_subset)):
                controller_state = controller_states[comm_turn][i]
                controller_action = controller_actions[comm_turn][i]
                controller_next_state = controller_states[comm_turn + 1][i]
                controller_reward = 0
                controller_terminal = False
                if comm_turn == self._num_communication_turns - 1:
                    controller_reward = self._intrinsic_reward
                    controller_terminal = True

                self._controller.store(controller_state, controller_action,
                                       controller_reward,
                                       controller_next_state,
                                       controller_terminal, eval)

        # Reset/Update bookkeeping variables.
        if self._intrinsic_reward:
            for controller in controller_subset:
                self._done_controllers.append(controller)
            self._tried_constraints = self.reset_tried_constraints()

        return output_actions

    def best_action(self, environment_state, controller_ordering):
        return self.sample(environment_state, controller_ordering, eval=True)

    def store(self,
              state,
              output_actions,
              reward,
              next_state,
              terminal,
              eval=False):
        """Stores the current transition in the meta-controller's replay memory.
           The transition is stored in the replay memory of the controller.
           If the transition culminates in a subgoal's completion or a terminal state, a
           transition for the meta-controller is constructed and stored in its replay buffer.
           Args:
            state: current state
            action: primitive action taken
            reward: reward received from state-action pair
            next_state: next state
            terminal: extrinsic terminal (True or False)
            eval: whether the current episode is a train or eval episode.
        """

        curr_meta_controller_state = self._meta_controller_states[-1]
        action = self._curr_constraint
        next_meta_controller_state = self.get_meta_controller_state()
        self._meta_controller_reward += reward

        self._meta_controller.store(curr_meta_controller_state,
                                    self._curr_constraint,
                                    self._meta_controller_reward,
                                    next_meta_controller_state, terminal, eval)

        self._meta_controller_state = None

    def update(self):
        self._controller.update()
        # Only update meta-controller right after a meta-controller transition has taken place,
        # which occurs only when either a subgoal has been completed or the agent has reached a
        # terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
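
The class above delegates credit assignment and subset selection to the critic_fn and controller_subset_fn callbacks. Their signatures are only implied by the call sites (critic_fn(env_states, constraint, orderings, output_actions) and controller_subset_fn(controller_ordering, done_controllers)), so the stubs below are guesses for illustration, not the project's actual environment hooks.

import numpy as np

def toy_critic_fn(env_states, constraint, orderings, output_actions):
    # Hypothetical critic: reward 1 when the controllers output distinct actions.
    return 1 if len(set(int(a) for a in output_actions)) == len(output_actions) else 0

def toy_controller_subset_fn(controller_ordering, done_controllers):
    # Hypothetical subset picker: the first two controllers that are not done yet.
    remaining = [c for c in controller_ordering if c not in done_controllers]
    return remaining[:2]   # the sampling loop currently assumes subsets of size 2

agent = FederatedControlAgent(learning_rates=[0.1, 0.00025],
                              state_sizes=[8, 6],
                              constraints=np.eye(4),
                              num_constraints=4,
                              num_primitive_actions=5,
                              num_controllers=4,
                              num_controllers_per_subtask=2,
                              num_communication_turns=3,
                              critic_fn=toy_critic_fn,
                              controller_subset_fn=toy_controller_subset_fn)
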
Example #7
class HierarchicalDqnAgent(object):
    INTRINSIC_STEP_COST = -1  # Step cost for the controller.

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None):
        """Initializes a hierarchical DQN agent.
           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
                         State sizes are assumed to be 1-dimensional.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if agent has satisfied a particular subgoal.
        """

        self._meta_controller = DqnAgent(state_dims=state_sizes[0],
                                         num_actions=num_subgoals,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)

        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[state_sizes[1] + num_subgoals],
                                    epsilon_end=0.01)

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intrinsic_time_step = 0
        self._episode = 0
        self._original_state = None

    def get_meta_controller_state(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(
                state, self._original_state)

        return np.copy(returned_state)

    def get_controller_state(self, state, subgoal_index):
        # Concatenates the environment state with the current subgoal.

        # curr_subgoal is a 1-hot vector indicating the current subgoal selected by the meta-controller.
        curr_subgoal = np.array(self._subgoals[subgoal_index])

        # Concatenate the environment state with the subgoal.
        controller_state = np.array(state)
        controller_state = np.concatenate((controller_state, curr_subgoal),
                                          axis=0)

        return np.copy(controller_state)

    def intrinsic_reward(self, state, subgoal_index):
        # Intrinsically rewards the controller - this is the critic in the h-DQN algorithm.
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        # Checks whether the controller has completed the currently specified subgoal.
        if self._check_subgoal_fn is None:
            return state == self._subgoals[subgoal_index]
        else:
            return self._check_subgoal_fn(state, subgoal_index)

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.
           The transition is stored in the replay memory of the controller.
           If the transition culminates in a subgoal's completion or a terminal state, a
           transition for the meta-controller is constructed and stored in its replay buffer.
           Args:
            state: current state
            action: primitive action taken
            reward: reward received from state-action pair
            next_state: next state
            terminal: extrinsic terminal (True or False)
            eval: Whether the current episode is a train or eval episode.
        """

        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = np.copy(
            self.get_controller_state(state, self._curr_subgoal))
        intrinsic_next_state = np.copy(
            self.get_controller_state(next_state, self._curr_subgoal))
        intrinsic_reward = self.intrinsic_reward(next_state,
                                                 self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state,
                                                   self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        # Store the controller transition in memory.
        self._controller.store(intrinsic_state, action, intrinsic_reward,
                               intrinsic_next_state, intrinsic_terminal, eval)

        self._meta_controller_reward += reward

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:

            # Store the meta-controller transition in memory.
            meta_controller_state = np.copy(self._meta_controller_state)
            next_meta_controller_state = np.copy(
                self.get_meta_controller_state(next_state))

            self._meta_controller.store(meta_controller_state,
                                        self._curr_subgoal,
                                        self._meta_controller_reward,
                                        next_meta_controller_state, terminal,
                                        eval)

            # Reset the current meta-controller state and current subgoal to be None
            # since the current subgoal is finished. Also reset the meta-controller's reward.
            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0
            self._intrinsic_time_step = 0

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.
           Samples a subgoal if necessary from the meta-controller and samples a primitive action
           from the controller.
           Args:
            state: the current environment state.
           Returns:
            action: a sampled primitive action.
        """
        self._intrinsic_time_step += 1

        # If the meta-controller state is None, it means that either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.sample(
                [self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample(controller_state)

        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.
           Gets the greedy subgoal if necessary from the meta-controller and gets
           the greedy primitive action from the controller.
           Args:
            state: the current environment state.
           Returns:
            action: the controller's greedy primitive action.
        """

        # If the meta-controller state is None, it means that either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.best_action(
                [self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action(controller_state)
        return action

    def update(self):
        self._controller.update()
        # Only update meta-controller right after a meta-controller transition has taken place,
        # which occurs only when either a subgoal has been completed or the agent has reached a
        # terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
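
Taken together, sample/store/update form the agent's training interface. Below is a rough training-loop sketch; FakeEnv and all sizes are invented stand-ins, not part of this codebase.

import random

class FakeEnv(object):
    """Toy stand-in environment: 16-dim one-hot state, 4 primitive actions, 50-step episodes."""
    def reset(self):
        self.t = 0
        return [1] + [0] * 15
    def step(self, action):
        self.t += 1
        state = [0] * 16
        state[random.randrange(16)] = 1
        reward = 1 if state[15] == 1 else 0
        return state, reward, self.t >= 50

def check_subgoal(state, subgoal_index):
    # Placeholder: subgoal i is "reached" when state feature i is active.
    return state[subgoal_index] == 1

env = FakeEnv()
agent = HierarchicalDqnAgent(learning_rates=[0.1, 0.00025],
                             state_sizes=[16, 16],
                             subgoals=[[1, 0, 0, 0], [0, 1, 0, 0],
                                       [0, 0, 1, 0], [0, 0, 0, 1]],
                             num_subgoals=4,
                             num_primitive_actions=4,
                             check_subgoal_fn=check_subgoal)

for episode in range(1000):
    state = env.reset()
    terminal = False
    while not terminal:
        action = agent.sample(state)
        next_state, reward, terminal = env.step(action)
        agent.store(state, action, reward, next_state, terminal)
        agent.update()
        state = next_state
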
Example #8
    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 agent_types=['network', 'network'],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None,
                 use_extra_travel_penalty=False,
                 use_extra_bit_for_subgoal_center=False,
                 use_controller_dqn=False,
                 use_intrinsic_timeout=False,
                 use_memory=False,
                 memory_size=0,
                 pretrain_controller=False):
        print "h-DQN"
        print "Use extra travel penalty:"
        print use_extra_travel_penalty
        print "Use extra bit for subgoal center:"
        print use_extra_bit_for_subgoal_center
        print "Use controller dqn:"
        print use_controller_dqn
        print "Use intrinsic timeout:"
        print use_intrinsic_timeout
        print "Use memory:"
        print use_memory
        print "Memory size:"
        print memory_size
        print "Pretrain Controller:"
        print pretrain_controller
        """Initializes a hierarchical DQN agent.

           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
            agent_types: type of each agent - either tabular QLearning agent or Deep Q Network.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if agent has satisfied a particular subgoal.
            use_extra_travel_penalty: whether or not to penalize the meta-controller for bad instructions.
            use_extra_bit_for_subgoal_center: whether or not to use an extra bit to indicate whether
                                              agent is at center of a particular cluster.
            use_controller_dqn: whether to use regular dqn or controller dqn for the controller.
            use_intrinsic_timeout: whether or not to intrinsically timeout the controller.
            use_memory: whether the meta-controller uses an LSTM over a fixed-length history of
                        recently visited clusters instead of a tabular state.
            memory_size: length of that history.
            pretrain_controller: whether to train the controller alone for PRETRAIN_EPISODES
                                 episodes before the meta-controller starts storing transitions.
        """
        if not use_extra_travel_penalty:
            self.EXTRA_TRAVEL_PENALTY = 0

        if use_extra_bit_for_subgoal_center:
            self.ARTIFICIAL_PENALTY = 0
            state_sizes[0] = state_sizes[0] * 2

        if not pretrain_controller:
            self.PRETRAIN_EPISODES = 0

        if use_memory:
            print "Decaying meta-controller epsilon faster!"
            self._meta_controller = LstmDqnAgent(num_actions=num_subgoals,
                                                 state_dims=[memory_size],
                                                 sequence_length=memory_size,
                                                 replay_memory_init_size=100,
                                                 target_update=100,
                                                 epsilon_end=0.01,
                                                 epsilon_decay_steps=5000)
        else:
            self._meta_controller = QLearningAgent(
                num_states=state_sizes[0],
                num_actions=num_subgoals,
                learning_rate=learning_rates[0],
                epsilon=0.1)
        if use_controller_dqn:
            self._controller = ControllerDqnAgent(
                learning_rate=learning_rates[1],
                num_actions=num_primitive_actions,
                state_dims=state_sizes[1],
                subgoal_dims=[num_subgoals])
        else:
            print "Epsilon end for controller is 0.01!"
            self._controller = DqnAgent(
                learning_rate=learning_rates[1],
                num_actions=num_primitive_actions,
                state_dims=[state_sizes[1][0] + num_subgoals],
                epsilon_end=0.01)  # CHANGED

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._use_extra_bit_for_subgoal_center = use_extra_bit_for_subgoal_center
        self._use_controller_dqn = use_controller_dqn

        self._use_intrinsic_timeout = use_intrinsic_timeout

        self._use_memory = use_memory
        self._memory_size = memory_size

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intermediate_clusters = []
        self._intermediate_dict = defaultdict(int)
        self._intermediate_clusters_dict = defaultdict(int)
        self._history = [0 for i in xrange(self._memory_size)]

        # Only used if use_extra_bit_for_subgoal_center is True.
        self._original_state = None

        self._next_meta_controller_state = None

        self._intrinsic_time_step = 0

        self._episode = 0
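
Most of these flags simply overwrite class-level constants or swap in a different agent implementation (LstmDqnAgent vs. tabular QLearningAgent for the meta-controller, ControllerDqnAgent vs. plain DqnAgent for the controller). A hedged configuration sketch follows; the concrete sizes are placeholders, and note that when use_controller_dqn is False the constructor indexes state_sizes[1][0], so the controller state size is passed as a one-element list here.

# Hypothetical configuration: LSTM-based meta-controller over a 4-step cluster history,
# plain DQN controller, with intrinsic timeouts and controller pretraining enabled.
subgoal_array = [[float(i == j) for j in range(10)] for i in range(10)]  # placeholder subgoal encodings
agent = HierarchicalDqnAgent(learning_rates=[0.1, 0.00025],
                             state_sizes=[10, [16]],
                             subgoals=subgoal_array,
                             num_subgoals=10,
                             num_primitive_actions=4,
                             use_memory=True,
                             memory_size=4,
                             use_intrinsic_timeout=True,
                             pretrain_controller=True)
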
Example #9
class HierarchicalDqnAgent(object):
    INTRINSIC_STEP_COST = -1  # Step cost for the controller.

    INTRINSIC_TIME_OUT = 50  # Number of steps after which intrinsic episode ends.
    INTRINSIC_TIME_OUT_PENALTY = -10  # Penalty given to controller for timing out episode.

    ARTIFICIAL_PENALTY = -100  # Penalty given to the meta-controller for telling the
    # agent to go to the same cluster it is already in.
    EXTRA_TRAVEL_PENALTY = -1  # Penalty given to meta-controller if controller agent
    # travels through additional clusters to get to target cluster.
    PRETRAIN_EPISODES = 100

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 agent_types=['network', 'network'],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None,
                 use_extra_travel_penalty=False,
                 use_extra_bit_for_subgoal_center=False,
                 use_controller_dqn=False,
                 use_intrinsic_timeout=False,
                 use_memory=False,
                 memory_size=0,
                 pretrain_controller=False):
        print "h-DQN"
        print "Use extra travel penalty:"
        print use_extra_travel_penalty
        print "Use extra bit for subgoal center:"
        print use_extra_bit_for_subgoal_center
        print "Use controller dqn:"
        print use_controller_dqn
        print "Use intrinsic timeout:"
        print use_intrinsic_timeout
        print "Use memory:"
        print use_memory
        print "Memory size:"
        print memory_size
        print "Pretrain Controller:"
        print pretrain_controller
        """Initializes a hierarchical DQN agent.

           Args:
            learning_rates: learning rates of the meta-controller and controller agents.
            state_sizes: state sizes of the meta-controller and controller agents.
            agent_types: type of each agent - either tabular QLearning agent or Deep Q Network.
            subgoals: array of subgoals for the meta-controller.
            num_subgoals: the action space of the meta-controller.
            num_primitive_actions: the action space of the controller.
            meta_controller_state_fn: function that returns the state of the meta-controller.
            check_subgoal_fn: function that checks if agent has satisfied a particular subgoal.
            use_extra_travel_penalty: whether or not to penalize the meta-controller for bad instructions.
            use_extra_bit_for_subgoal_center: whether or not to use an extra bit to indicate whether
                                              agent is at center of a particular cluster.
            use_controller_dqn: whether to use regular dqn or controller dqn for the controller.
            use_intrinsic_timeout: whether or not to intrinsically timeout the controller.
            use_memory: whether the meta-controller uses an LSTM over a fixed-length history of
                        recently visited clusters instead of a tabular state.
            memory_size: length of that history.
            pretrain_controller: whether to train the controller alone for PRETRAIN_EPISODES
                                 episodes before the meta-controller starts storing transitions.
        """
        if not use_extra_travel_penalty:
            self.EXTRA_TRAVEL_PENALTY = 0

        if use_extra_bit_for_subgoal_center:
            self.ARTIFICIAL_PENALTY = 0
            state_sizes[0] = state_sizes[0] * 2

        if not pretrain_controller:
            self.PRETRAIN_EPISODES = 0

        if use_memory:
            print "Decaying meta-controller epsilon faster!"
            self._meta_controller = LstmDqnAgent(num_actions=num_subgoals,
                                                 state_dims=[memory_size],
                                                 sequence_length=memory_size,
                                                 replay_memory_init_size=100,
                                                 target_update=100,
                                                 epsilon_end=0.01,
                                                 epsilon_decay_steps=5000)
        else:
            self._meta_controller = QLearningAgent(
                num_states=state_sizes[0],
                num_actions=num_subgoals,
                learning_rate=learning_rates[0],
                epsilon=0.1)
        if use_controller_dqn:
            self._controller = ControllerDqnAgent(
                learning_rate=learning_rates[1],
                num_actions=num_primitive_actions,
                state_dims=state_sizes[1],
                subgoal_dims=[num_subgoals])
        else:
            print "Epsilon end for controller is 0.01!"
            self._controller = DqnAgent(
                learning_rate=learning_rates[1],
                num_actions=num_primitive_actions,
                state_dims=[state_sizes[1][0] + num_subgoals],
                epsilon_end=0.01)  # CHANGED

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals

        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._use_extra_bit_for_subgoal_center = use_extra_bit_for_subgoal_center
        self._use_controller_dqn = use_controller_dqn

        self._use_intrinsic_timeout = use_intrinsic_timeout

        self._use_memory = use_memory
        self._memory_size = memory_size

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intermediate_clusters = []
        self._intermediate_dict = defaultdict(int)
        self._intermediate_clusters_dict = defaultdict(int)
        self._history = [0 for i in xrange(self._memory_size)]

        # Only used if use_extra_bit_for_subgoal_center is True.
        self._original_state = None

        self._next_meta_controller_state = None

        self._intrinsic_time_step = 0

        self._episode = 0

    def update_history(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(
                state, self._original_state)

        current_cluster_id = np.where(
            np.squeeze(returned_state) == 1)[0][0] + 1
        new_history = self._history[1:]

        # print "History update!"
        # print self._history
        # print new_history
        # print current_cluster_id
        new_history.append(current_cluster_id)
        # print new_history
        # print ""
        self._history = new_history

    def get_meta_controller_state(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(
                state, self._original_state)

        if self._use_memory:
            returned_state = self._history[:]

        return returned_state

    def get_controller_state(self, state, subgoal_index):
        curr_subgoal = self._subgoals[subgoal_index]

        # Concatenate the environment state with the subgoal.
        controller_state = list(state[0])
        for i in xrange(len(curr_subgoal)):
            controller_state.append(curr_subgoal[i])
        controller_state = np.array([controller_state])
        # print controller_state
        return np.copy(controller_state)

    def intrinsic_reward(self, state, subgoal_index):
        if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
            return self.INTRINSIC_TIME_OUT_PENALTY
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        if self._check_subgoal_fn is None:
            if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
                return True
            return state == self._subgoals[subgoal_index]
        else:
            if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
                return True

            if not self._use_memory and self._meta_controller_state[
                    self._curr_subgoal] == 1:
                if np.sum(self._meta_controller_state) > 1:
                    return False

                return self._check_subgoal_fn(state, subgoal_index,
                                              self._original_state)
            else:
                return self._check_subgoal_fn(state, subgoal_index)

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.
           The transition is stored in the replay memory of the controller.
           If the transition culminates in a subgoal's completion or a terminal state, a
           transition for the meta-controller is constructed and stored in its replay buffer.

           Args:
            state: current state
            action: primitive action taken
            reward: reward received from state-action pair
            next_state: next state
            terminal: extrinsic terminal (True or False)
            eval: Whether the current episode is a train or eval episode.
        """

        self._meta_controller_reward += reward
        self._intrinsic_time_step += 1

        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = self.get_controller_state(state, self._curr_subgoal)
        intrinsic_next_state = self.get_controller_state(
            next_state, self._curr_subgoal)
        intrinsic_reward = self.intrinsic_reward(next_state,
                                                 self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state,
                                                   self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        self._controller.store(np.copy(intrinsic_state), action,
                               intrinsic_reward, np.copy(intrinsic_next_state),
                               intrinsic_terminal, eval)

        # Check for intermediate state.
        intermediate_meta_controller_state = self.get_meta_controller_state(
            next_state)

        if not self._use_memory:
            intermediate_cluster_id = np.where(
                np.squeeze(intermediate_meta_controller_state) == 1)[0][0]
        else:
            intermediate_cluster_id = intermediate_meta_controller_state[-1] - 1

        self._intermediate_dict[intermediate_cluster_id] += 1
        # Agent is traveling through a cluster that is not the starting or ending cluster.
        # FIX THIS!!!!
        if list(intermediate_meta_controller_state[0:self._num_subgoals]
                ) != list(self._meta_controller_state[0:self._num_subgoals]
                          ) and not subgoal_completed:
            self._meta_controller_reward += self.EXTRA_TRAVEL_PENALTY

            self._intermediate_clusters.append(intermediate_cluster_id)
            self._intermediate_clusters_dict[intermediate_cluster_id] += 1

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:
            # Normalize the meta-controller reward.
            self._meta_controller_reward /= 100.0

            meta_controller_state = np.copy(self._meta_controller_state)
            if not self._use_memory:
                next_meta_controller_state = self.get_meta_controller_state(
                    next_state)
            else:
                returned_state = self._meta_controller_state_fn(
                    next_state, self._original_state)
                current_cluster_id = np.where(
                    np.squeeze(returned_state) == 1)[0][0] + 1
                new_history = self._history[1:]
                new_history.append(current_cluster_id)
                next_meta_controller_state = new_history

            if self._episode >= self.PRETRAIN_EPISODES:
                self._meta_controller.store(
                    np.copy(meta_controller_state), self._curr_subgoal,
                    self._meta_controller_reward,
                    np.copy(next_meta_controller_state), terminal, eval,
                    reward)

            if eval:
                if subgoal_completed:
                    print "Subgoal completed!"
                    print "Intermediate Clusters:"
                    print self._intermediate_clusters
                    print "Intermediate Cluster Count:"
                    print self._intermediate_dict
                    print "Intermediate non-beginning cluster count:"
                    print self._intermediate_clusters_dict
                    print "State:"
                    print next_state
                    print "Meta-Controller reward:"
                    print self._meta_controller_reward
                    print "Intrinsic reward:"
                    print intrinsic_reward
                    print "Cluster:"
                    print next_meta_controller_state
                    print ""
                    print ""
                else:
                    print "Terminal!"
                    print "Intermediate clusters:"
                    print self._intermediate_clusters
                    print "Intermediate cluster count:"
                    print self._intermediate_dict
                    print "Intermediate non-beginning cluster count:"
                    print self._intermediate_clusters_dict
                    print "State:"
                    print next_state
                    print "Meta-Controller reward:"
                    print self._meta_controller_reward
                    print "Intrinsic reward:"
                    print intrinsic_reward
                    print "Cluster:"
                    print next_meta_controller_state
                    print ""
                    print ""

            # Reset the current meta-controller state and current subgoal to be None
            # since the current subgoal is finished. Also reset the meta-controller's reward.
            self._next_meta_controller_state = np.copy(
                next_meta_controller_state)

            if terminal:
                self._next_meta_controller_state = None

            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0

            self._intermediate_clusters = []
            self._intermediate_dict = defaultdict(int)
            self._intermediate_clusters_dict = defaultdict(int)

            self._original_state = None
            self._intrinsic_time_step = 0

            if terminal:
                self._history = [0 for i in xrange(self._memory_size)]

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.
           Samples a subgoal if necessary from the meta-controller and samples a primitive action
           from the controller.

           Args:
            state: the current environment state.

           Returns:
            action: a primitive action.
        """
        if self._meta_controller_state is None:
            if self._use_memory:
                self.update_history(state)

            if self._next_meta_controller_state is not None and not self._use_memory:
                self._meta_controller_state = self._next_meta_controller_state
            else:
                self._meta_controller_state = self.get_meta_controller_state(
                    state)

            self._curr_subgoal = self._meta_controller.sample(
                [self._meta_controller_state])

            # Artificially penalize the meta-controller for picking the subgoal to
            # be the same as the current cluster.
            if self._use_memory:
                same_cluster_instruction = (self._meta_controller_state[-1] -
                                            1) == self._curr_subgoal
            else:
                same_cluster_instruction = self._meta_controller_state[
                    self._curr_subgoal] == 1

            if same_cluster_instruction:
                self._meta_controller_reward = self.ARTIFICIAL_PENALTY
                self._original_state = state

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample(controller_state)

        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.
           Gets the greedy subgoal if necessary from the meta-controller and gets
           the greedy primitive action from the controller.

           Args:
            state: the current environment state.

           Returns:
            action: the controller's greedy primitive action.
        """
        returned_info = None

        if self._meta_controller_state is None:
            if self._use_memory:
                self.update_history(state)

            if self._next_meta_controller_state is not None and not self._use_memory:
                self._meta_controller_state = self._next_meta_controller_state
            else:
                self._meta_controller_state = self.get_meta_controller_state(
                    state)

            self._curr_subgoal = self._meta_controller.best_action(
                [self._meta_controller_state])

            returned_info = [self._meta_controller_state, self._curr_subgoal]

            # Artificially penalize the meta-controller for picking the subgoal to
            # be the same as the current cluster.
            if self._use_memory:
                same_cluster_instruction = (self._meta_controller_state[-1] -
                                            1) == self._curr_subgoal
            else:
                same_cluster_instruction = self._meta_controller_state[
                    self._curr_subgoal] == 1

            if same_cluster_instruction:
                self._meta_controller_reward = self.ARTIFICIAL_PENALTY
                self._original_state = state

            print "Current State:"
            print state
            print "Current Meta-Controller State:"
            print self._meta_controller_state
            print "Current subgoal picked:"
            print self._curr_subgoal

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action(controller_state)
        return action, returned_info

    def update(self):
        self._controller.update()
        # Only update meta-controller right after a meta-controller transition has taken place,
        # which occurs only when either a subgoal has been completed or the agent has reached a
        # terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
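
This variant passes extra arguments to its callbacks compared to the plain h-DQN above: meta_controller_state_fn(state, original_state), and check_subgoal_fn with either (state, subgoal_index) or (state, subgoal_index, original_state) depending on the branch taken in subgoal_completed(). The stubs below are guesses at compatible signatures, purely for illustration; the clustering rule is invented.

import numpy as np

NUM_CLUSTERS = 10  # placeholder number of clusters / subgoals

def meta_controller_state_fn(state, original_state=None):
    # Hypothetical mapping from a raw state to a one-hot cluster indicator,
    # as expected by get_meta_controller_state() and update_history().
    cluster_id = int(np.argmax(np.squeeze(state))) % NUM_CLUSTERS  # invented clustering rule
    one_hot = np.zeros(NUM_CLUSTERS)
    one_hot[cluster_id] = 1
    return one_hot

def check_subgoal_fn(state, subgoal_index, original_state=None):
    # Hypothetical critic: the subgoal is reached when the state maps to the target cluster.
    return meta_controller_state_fn(state, original_state)[subgoal_index] == 1
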