def __init__(self,
             learning_rates=[0.05, 0.00025],
             state_sizes=[0, 0],
             subgoals=None,
             num_subgoals=0,
             num_primitive_actions=0,
             meta_controller_state_fn=None,
             check_subgoal_fn=None,
             load=True):
    """Initializes a hierarchical DQN agent.

    Args:
      learning_rates: learning rates of the meta-controller and controller agents.
      state_sizes: state sizes of the meta-controller and controller agents.
        State sizes are assumed to be 1-dimensional.
      subgoals: array of subgoals for the meta-controller.
      num_subgoals: the action space of the meta-controller.
      num_primitive_actions: the action space of the controller.
      meta_controller_state_fn: function that returns the state of the meta-controller.
      check_subgoal_fn: function that checks if the agent has satisfied a particular subgoal.
      load: whether to load previously saved meta-controller and controller weights.
    """
    subgoals = np.array(subgoals)

    self.meta_path = "weights/meta/model"
    self.control_path = "weights/control/model"
    meta, control = None, None
    if load:
        meta = self.meta_path
        control = self.control_path

    self._meta_controller = DqnAgent(state_dims=state_sizes,
                                     num_actions=subgoals.shape[0],
                                     learning_rate=learning_rates[0],
                                     epsilon_end=0.01,
                                     file_path=meta)
    self._controller = DqnAgent(learning_rate=learning_rates[1],
                                num_actions=num_primitive_actions,
                                state_dims=[state_sizes[0] + subgoals.shape[1]],
                                epsilon_end=0.01,
                                file_path=control)

    self._subgoals = subgoals
    self._num_subgoals = num_subgoals
    self._meta_controller_state_fn = meta_controller_state_fn
    self._check_subgoal_fn = check_subgoal_fn

    self._meta_controller_state = None
    self._curr_subgoal = None
    self._meta_controller_reward = 0
    self._intrinsic_time_step = 0
    self._episode = 0
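# A minimal sketch of how the subgoals array and check_subgoal_fn hook could be supplied to
# this constructor. The one-hot subgoal encoding and the region-index convention below are
# illustrative assumptions, not part of the agent itself.
import numpy as np

# Hypothetical setup: four subgoals encoded as one-hot rows, so subgoals.shape == (4, 4) and
# the controller state becomes [environment state, one-hot subgoal].
example_subgoals = np.eye(4)

def example_check_subgoal_fn(state, subgoal_index):
    # Illustrative assumption: the environment state exposes the agent's current region index
    # at position 0, and a subgoal is satisfied once the agent is inside that region.
    return int(state[0]) == subgoal_index

# agent = HierarchicalDqnAgent(subgoals=example_subgoals,
#                              num_subgoals=4,
#                              num_primitive_actions=4,
#                              check_subgoal_fn=example_check_subgoal_fn,
#                              load=False)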
def __init__(self, num_agents, state_size, num_actions, num_communication_turns):
    """Initializes a multi-agent RL agent. Used as a baseline in the FCRL experiments.

    Args:
      num_agents: Number of agents / controllers.
      state_size: The base size of each agent's state.
      num_actions: The number of actions.
      num_communication_turns: Number of turns for which the agents communicate.
    """
    self._num_agents = num_agents
    self._num_actions = num_actions
    self._num_communication_turns = num_communication_turns
    self._base_state_size = state_size
    self._state_size = state_size + self._num_communication_turns * self._num_actions

    # Initialize the agents.
    self._agents = []
    for i in xrange(self._num_agents):
        self._agents.append(DqnAgent(state_dims=self._state_size,
                                     num_actions=self._num_actions,
                                     learning_rate=0.1,
                                     epsilon_end=0.01))

    self._communication_states = None
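# The state size computed above implies a layout of [base environment state | one
# num_actions-wide slot per communication turn]. The sketch below builds such a vector under
# the assumption that received messages are action indices, one per completed turn; it
# illustrates the layout and is not code used by the class.
import numpy as np

def build_communication_state(base_state, received_messages, num_actions, num_communication_turns):
    state = np.zeros(len(base_state) + num_communication_turns * num_actions)
    state[:len(base_state)] = base_state
    for turn, action in enumerate(received_messages):
        # One-hot encode the message received on this turn into its slot.
        state[len(base_state) + turn * num_actions + action] = 1
    return state

# Example: 3-dim base state, 2 actions, 2 communication turns, action 1 received on turn 0:
# build_communication_state(np.array([0., 1., 0.]), [1], 2, 2)
# -> array([0., 1., 0., 0., 1., 0., 0.])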
def create_controllers(self, tsc):
    if tsc == 'uniform':
        for id in self.tl_ids:
            c = UniformController(self.conn, id, self.netdata, self.mode,
                                  self.red_t, self.yellow_t, self.green_t)
            self.Controllers[id] = c
    elif tsc == 'dqn':
        for id in self.tl_ids:
            # Density and queue for each incoming lane, plus the current phase
            # (+1 for the all-red clearance phase).
            state_size = (len(self.netdata['inter'][id]['incoming_lanes']) * 2
                          + len(self.netdata['inter'][id]['green_phases']) + 1)
            action_size = len(self.netdata['inter'][id]['green_phases'])
            print(state_size, action_size)
            rlagent = DqnAgent(self.args.batch_size, state_size, action_size)
            c = DqnController(self.conn, id, self.netdata, self.mode, rlagent,
                              self.green_t, self.yellow_t, self.red_t)
            self.Controllers[id] = c
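# For reference, a sketch of the observation implied by state_size above: one density and one
# queue value per incoming lane, a one-hot over the green phases, and a final bit for the
# all-red clearance phase. The argument names here are assumptions; the real DqnController
# assembles its own observation.
import numpy as np

def build_intersection_state(densities, queues, green_phases, current_phase):
    phase_vector = np.zeros(len(green_phases) + 1)
    if current_phase in green_phases:
        phase_vector[green_phases.index(current_phase)] = 1
    else:
        # Any non-green (clearance) phase maps to the final all-red bit.
        phase_vector[-1] = 1
    return np.concatenate([densities, queues, phase_vector])

# With 4 incoming lanes and 2 green phases this gives a vector of length 4*2 + 2 + 1 = 11,
# matching state_size above.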
class FederatedControlAgent(object):

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 constraints=None,
                 num_constraints=0,
                 num_primitive_actions=0,
                 num_controllers=0,
                 num_controllers_per_subtask=0,
                 num_communication_turns=0,
                 critic_fn=None,
                 controller_subset_fn=None):
        """Initializes an FCRL agent.

        Args:
          learning_rates: learning rates of the meta-controller and controller agents.
          state_sizes: state sizes of the meta-controller and controller agents.
            State sizes are assumed to be 1-dimensional.
          constraints: array of constraints for the meta-controller, which defines its action space.
          num_constraints: number of actions for the meta-controller.
          num_primitive_actions: number of actions for the controller.
          num_controllers: total number of controllers.
          num_controllers_per_subtask: the number of controllers that coordinate to complete
            a given subtask.
          num_communication_turns: the number of turns for which controllers communicate.
          critic_fn: a custom critic function for a particular environment.
          controller_subset_fn: a custom function that returns the next controller subset.
        """
        self._meta_controller_state_size = state_sizes[0]
        self._num_controllers = num_controllers
        # Number of controllers that communicate to complete a subtask.
        self._num_controllers_per_subtask = num_controllers_per_subtask

        # A controller's state size is the input state size (the environment state)
        # + the ordering vector size (num_controllers_per_subtask)
        # + the communication vectors from the communication rounds and output round
        #   (num_communication_turns * num_primitive_actions).
        self._controller_state_size = state_sizes[1]
        self._controller_state_size += self._num_controllers_per_subtask
        self._controller_state_size += num_communication_turns * num_primitive_actions

        self._meta_controller = DqnAgent(state_dims=state_sizes[0],
                                         num_actions=num_constraints,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)
        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[self._controller_state_size],
                                    epsilon_end=0.01)

        self._constraints = constraints
        self._num_constraints = num_constraints
        self._num_primitive_actions = num_primitive_actions
        self._num_communication_turns = num_communication_turns
        self._critic_fn = critic_fn
        self._controller_subset_fn = controller_subset_fn

        self._intrinsic_time_step = 0
        self._episode = 0

        # Book-keeping variables.
        # Keeps track of the current meta-controller state.
        self._meta_controller_state = None
        # Keeps track of the meta-controller states visited in the current episode.
        self._meta_controller_states = []
        # Keeps track of the current action selected by the meta-controller.
        self._curr_constraint = None
        # Keeps track of the meta-controller's reward for the current meta-controller time step.
        self._meta_controller_reward = 0
        # Keeps track of the constraints tried for the current controller subset.
        self._tried_constraints = self.reset_tried_constraints()
        # Keeps track of the primitive actions selected so far in the current episode.
        self._selected_primitive_actions = []
        # Keeps track of controllers who have completed coordination in the current episode.
        self._done_controllers = []

    def reset_tried_constraints(self):
        return np.zeros(self._num_constraints)

    def get_meta_controller_state(self):
        """Returns the meta-controller state.

        Concatenates a vector representation of the largest selected primitive action with
        the tried-constraints vector.
""" state = np.zeros(self._num_primitive_actions) if len(self._selected_primitive_actions): selected_primitive_actions = np.array( self._selected_primitive_actions) max_primtive_action = np.max(selected_primitive_actions) state[max_primtive_action] = 1 state = np.concatenate((state, np.copy(self._tried_constraints)), axis=0) return state def get_controller_environment_states(env_state): """Returns an array of controller environment states.""" controller_environment_states = np.split(env_state, self._num_controllers) return controller_environment_states def get_controller_state(self, env_state, constraint, ordering, comm_turn, communication_vector=None): """ Returns the controller state containing the controller's environment state, constraint, ordering vector, and received communication vectors. Args: env_state: The environment state for the current controller. constraint: The constraint provided to the current controller. ordering: The current controller's position vector in the overall ordering. communication_vector: communication received from other controllers in the current communication turn. """ controller_state = np.zeros(self._controller_state_size) # Apply the constraint to the environment state. env_state_plus_constraint = np.logical_and(env_state, constraint).astype(int) env_state_size = np.size(env_state_plus_constraint) controller_state[0:env_state_size] = env_state_plus_constraint controller_state[env_state_size:env_state_size_size + self._num_controllers_per_subtask] = ordering if comm_turn >= 1: controller_state[( env_state_size + self._num_controllers_per_subtask + (comm_turn - 1) * num_primitive_actions):( env_state_size + self._num_controllers_per_subtask + comm_turn * num_primitive_actions)] = communication_vector return np.copy(controller_state) def intrinsic_reward(self, env_states, constraints, orderings, selected_actions): """Intrinsically rewards a subset of controllers using the provided critic function.""" return self._critic_fn(controller_states, constraints, orderings, selected_actions) def construct_orderings(self): orderings = [] for i in xrange(np.size(self._num_controllers_per_subtask)): ordering = np.zeros(self._num_controllers_per_subtask) ordering[i] = 1 orderings.append(ordering) return orderings def controller_bookkeeping_vars(self): """ Returns initilizations for controller states, actions, communications, and outputs. """ # Keeps track of all the controller states. controller_states = np.zeros(self._num_communication_turns + 1, self._num_controllers, self._controller_state_size) # Keeps track of all controllers' selected actions (communication + output). controller_actions = np.zeros(self._num_communication_turns, self._num_controllers, 1) # List that will contain the output actions. output_actions = [] return controller_states, controller_actions, output_actions def sample(self, environment_state, controller_ordering, eval=False): """Samples a (possibly incomplete) output set of controller actions. Args: environment_state: The state provided by the environment. controller_ordering: the ordering of controllers specified by the environment. eval: Whether this is a train / test episode. """ meta_controller_state = self.get_meta_controller_state() self._meta_controller_states.append(meta_controller_state) # Sample a constraint from the meta-controller. 
        if not eval:
            constraint = self._meta_controller.sample(meta_controller_state)
        else:
            constraint = self._meta_controller.best_action(meta_controller_state)
        self._tried_constraints[constraint] = 1
        self._curr_constraint = constraint

        controller_environment_states = self.get_controller_environment_states(environment_state)
        controller_subset = self._controller_subset_fn(controller_ordering,
                                                       self._done_controllers)
        orderings = self.construct_orderings()
        controller_states, controller_actions, output_actions = self.controller_bookkeeping_vars()

        # Note: Currently only works when the subsets contain only 2 controllers due to the way
        # in which communication vectors are appended to the controller states.
        previous_turn_communication_vectors = [None, None]  # The latest communication vectors.

        for comm_turn in xrange(self._num_communication_turns + 1):
            communication_vectors = np.zeros((self._num_controllers_per_subtask,
                                              self._num_primitive_actions))
            for i in xrange(np.size(controller_subset)):
                ordering = orderings[i]

                # Construct the controller state.
                controller_index = controller_subset[i]
                env_state = controller_environment_states[controller_index]
                prev_comm_vector = previous_turn_communication_vectors[
                    (i + 1) % self._num_controllers_per_subtask]
                controller_state = self.get_controller_state(env_state, constraint, ordering,
                                                             comm_turn, prev_comm_vector)
                controller_states[comm_turn][i] = controller_state

                if not eval:
                    action = self._controller.sample(controller_state)
                else:
                    action = self._controller.best_action(controller_state)
                controller_actions[comm_turn][i] = action

                communication_vector = np.zeros(self._num_primitive_actions)
                communication_vector[action] = 1
                communication_vectors[i] = communication_vector
                previous_turn_communication_vectors[i] = communication_vector

                if comm_turn == self._num_communication_turns - 1:
                    output_actions.append(action)

        # Compute the intrinsic reward that all the controllers in the controller
        # subset receive.
        self._intrinsic_reward = self._critic_fn(controller_environment_states, constraint,
                                                 orderings, output_actions)

        # Store the controller transitions.
        for comm_turn in xrange(self._num_communication_turns):
            for i in xrange(np.size(controller_subset)):
                controller_state = controller_states[comm_turn][i]
                controller_action = controller_actions[comm_turn][i]
                controller_next_state = controller_states[comm_turn + 1][i]
                controller_reward = 0
                controller_terminal = False
                if comm_turn == self._num_communication_turns - 1:
                    controller_reward = self._intrinsic_reward
                    controller_terminal = True
                self._controller.store(controller_state, controller_action, controller_reward,
                                       controller_next_state, controller_terminal, eval)

        # Reset/Update bookkeeping variables.
        if self._intrinsic_reward:
            for controller in controller_subset:
                self._done_controllers.append(controller)
            self._tried_constraints = self.reset_tried_constraints()

        return output_actions

    def best_action(self, environment_state, controller_ordering):
        return self.sample(environment_state, controller_ordering, eval=True)

    def store(self, state, output_actions, reward, next_state, terminal, eval=False):
        """Stores the current transition in the meta-controller's replay memory.

        Controller transitions are stored in sample(); here a transition for the
        meta-controller is constructed and stored in its replay buffer.
        Args:
          state: current state
          output_actions: output actions selected by the controllers
          reward: reward received from state-action pair
          next_state: next state
          terminal: extrinsic terminal (True or False)
          eval: whether the current episode is a train or eval episode.
        """
        curr_meta_controller_state = self._meta_controller_states[-1]
        action = self._curr_constraint
        next_meta_controller_state = self.get_meta_controller_state()

        self._meta_controller_reward += reward
        self._meta_controller.store(curr_meta_controller_state, self._curr_constraint,
                                    self._meta_controller_reward, next_meta_controller_state,
                                    terminal, eval)
        self._meta_controller_state = None

    def update(self):
        self._controller.update()
        # Only update the meta-controller right after a meta-controller transition has taken
        # place, which occurs only when either a subgoal has been completed or the agent has
        # reached a terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
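# A minimal driver sketch for FederatedControlAgent, assuming a Gym-like environment whose
# step() accepts the controllers' output actions and which exposes the next controller
# ordering. The environment interface is an assumption; only the agent calls mirror the
# methods defined above.
def run_fcrl_episode(agent, env, max_steps=100):
    state = env.reset()
    for _ in xrange(max_steps):
        ordering = env.controller_ordering()  # assumed environment hook
        output_actions = agent.sample(state, ordering)
        next_state, reward, terminal = env.step(output_actions)
        agent.store(state, output_actions, reward, next_state, terminal)
        agent.update()
        state = next_state
        if terminal:
            break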
class HierarchicalDqnAgent(object):

    INTRINSIC_STEP_COST = -1  # Step cost for the controller.

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None):
        """Initializes a hierarchical DQN agent.

        Args:
          learning_rates: learning rates of the meta-controller and controller agents.
          state_sizes: state sizes of the meta-controller and controller agents.
            State sizes are assumed to be 1-dimensional.
          subgoals: array of subgoals for the meta-controller.
          num_subgoals: the action space of the meta-controller.
          num_primitive_actions: the action space of the controller.
          meta_controller_state_fn: function that returns the state of the meta-controller.
          check_subgoal_fn: function that checks if the agent has satisfied a particular subgoal.
        """
        self._meta_controller = DqnAgent(state_dims=state_sizes[0],
                                         num_actions=num_subgoals,
                                         learning_rate=learning_rates[0],
                                         epsilon_end=0.01)
        self._controller = DqnAgent(learning_rate=learning_rates[1],
                                    num_actions=num_primitive_actions,
                                    state_dims=[state_sizes[1] + num_subgoals],
                                    epsilon_end=0.01)

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals
        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intrinsic_time_step = 0
        self._episode = 0
        self._original_state = None

    def get_meta_controller_state(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)
        return np.copy(returned_state)

    def get_controller_state(self, state, subgoal_index):
        # Concatenates the environment state with the current subgoal.
        # curr_subgoal is a 1-hot vector indicating the current subgoal selected by the
        # meta-controller.
        curr_subgoal = np.array(self._subgoals[subgoal_index])
        controller_state = np.array(state)
        controller_state = np.concatenate((controller_state, curr_subgoal), axis=0)
        return np.copy(controller_state)

    def intrinsic_reward(self, state, subgoal_index):
        # Intrinsically rewards the controller - this is the critic in the h-DQN algorithm.
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        # Checks whether the controller has completed the currently specified subgoal.
        if self._check_subgoal_fn is None:
            return state == self._subgoals[subgoal_index]
        else:
            return self._check_subgoal_fn(state, subgoal_index)

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.

        The transition is stored in the replay memory of the controller. If the transition
        culminates in a subgoal's completion or a terminal state, a transition for the
        meta-controller is constructed and stored in its replay buffer.

        Args:
          state: current state
          action: primitive action taken
          reward: reward received from state-action pair
          next_state: next state
          terminal: extrinsic terminal (True or False)
          eval: Whether the current episode is a train or eval episode.
        """
        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = np.copy(self.get_controller_state(state, self._curr_subgoal))
        intrinsic_next_state = np.copy(self.get_controller_state(next_state, self._curr_subgoal))
        intrinsic_reward = self.intrinsic_reward(next_state, self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state, self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        # Store the controller transition in memory.
        self._controller.store(intrinsic_state, action, intrinsic_reward, intrinsic_next_state,
                               intrinsic_terminal, eval)

        self._meta_controller_reward += reward

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:
            # Store the meta-controller transition in memory.
            meta_controller_state = np.copy(self._meta_controller_state)
            next_meta_controller_state = np.copy(self.get_meta_controller_state(next_state))
            self._meta_controller.store(meta_controller_state, self._curr_subgoal,
                                        self._meta_controller_reward,
                                        next_meta_controller_state, terminal, eval)

            # Reset the current meta-controller state and current subgoal to be None
            # since the current subgoal is finished. Also reset the meta-controller's reward.
            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0
            self._intrinsic_time_step = 0

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.

        Samples a subgoal if necessary from the meta-controller and samples a primitive action
        from the controller.

        Args:
          state: the current environment state.

        Returns:
          action: a sampled primitive action.
        """
        self._intrinsic_time_step += 1

        # If the meta-controller state is None, it means that either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.sample([self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample(controller_state)
        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.

        Gets the greedy subgoal if necessary from the meta-controller and gets the greedy
        primitive action from the controller.

        Args:
          state: the current environment state.

        Returns:
          action: the controller's greedy primitive action.
        """
        # If the meta-controller state is None, it means that either this is a new episode
        # or a subgoal has just been completed.
        if self._meta_controller_state is None:
            self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.best_action([self._meta_controller_state])

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action(controller_state)
        return action

    def update(self):
        self._controller.update()
        # Only update the meta-controller right after a meta-controller transition has taken
        # place, which occurs only when either a subgoal has been completed or the agent has
        # reached a terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
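# A minimal training-loop sketch for HierarchicalDqnAgent, assuming a Gym-style environment
# whose step() returns (next_state, reward, terminal). The environment interface is an
# assumption; the agent calls match the methods defined above.
def run_hdqn_episode(agent, env, eval=False):
    state = env.reset()
    terminal = False
    episode_reward = 0
    while not terminal:
        action = agent.sample(state) if not eval else agent.best_action(state)
        next_state, reward, terminal = env.step(action)
        agent.store(state, action, reward, next_state, terminal, eval)
        if not eval:
            agent.update()
        episode_reward += reward
        state = next_state
    return episode_reward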
class HierarchicalDqnAgent(object):

    INTRINSIC_STEP_COST = -1          # Step cost for the controller.
    INTRINSIC_TIME_OUT = 50           # Number of steps after which the intrinsic episode ends.
    INTRINSIC_TIME_OUT_PENALTY = -10  # Penalty given to the controller for timing out the episode.
    ARTIFICIAL_PENALTY = -100         # Penalty given to the meta-controller for telling the
                                      # agent to go to the same cluster it is already in.
    EXTRA_TRAVEL_PENALTY = -1         # Penalty given to the meta-controller if the controller
                                      # agent travels through additional clusters to get to the
                                      # target cluster.
    PRETRAIN_EPISODES = 100

    def __init__(self,
                 learning_rates=[0.1, 0.00025],
                 state_sizes=[0, 0],
                 agent_types=['network', 'network'],
                 subgoals=None,
                 num_subgoals=0,
                 num_primitive_actions=0,
                 meta_controller_state_fn=None,
                 check_subgoal_fn=None,
                 use_extra_travel_penalty=False,
                 use_extra_bit_for_subgoal_center=False,
                 use_controller_dqn=False,
                 use_intrinsic_timeout=False,
                 use_memory=False,
                 memory_size=0,
                 pretrain_controller=False):
        """Initializes a hierarchical DQN agent.

        Args:
          learning_rates: learning rates of the meta-controller and controller agents.
          state_sizes: state sizes of the meta-controller and controller agents.
          agent_types: type of each agent - either tabular Q-learning agent or Deep Q-Network.
          subgoals: array of subgoals for the meta-controller.
          num_subgoals: the action space of the meta-controller.
          num_primitive_actions: the action space of the controller.
          meta_controller_state_fn: function that returns the state of the meta-controller.
          check_subgoal_fn: function that checks if the agent has satisfied a particular subgoal.
          use_extra_travel_penalty: whether or not to penalize the meta-controller for bad
            instructions.
          use_extra_bit_for_subgoal_center: whether or not to use an extra bit to indicate
            whether the agent is at the center of a particular cluster.
          use_controller_dqn: whether to use a regular DQN or the controller DQN for the
            controller.
          use_intrinsic_timeout: whether or not to intrinsically time out the controller.
          use_memory: whether the meta-controller state is a history of visited clusters
            (handled by an LSTM-based DQN).
          memory_size: length of that history.
          pretrain_controller: whether to train the controller alone for PRETRAIN_EPISODES
            episodes before the meta-controller starts storing transitions.
        """
        print "h-DQN"
        print "Use extra travel penalty:"
        print use_extra_travel_penalty
        print "Use extra bit for subgoal center:"
        print use_extra_bit_for_subgoal_center
        print "Use controller dqn:"
        print use_controller_dqn
        print "Use intrinsic timeout:"
        print use_intrinsic_timeout
        print "Use memory:"
        print use_memory
        print "Memory size:"
        print memory_size
        print "Pretrain Controller:"
        print pretrain_controller

        if not use_extra_travel_penalty:
            self.EXTRA_TRAVEL_PENALTY = 0
        if use_extra_bit_for_subgoal_center:
            self.ARTIFICIAL_PENALTY = 0
            state_sizes[0] = state_sizes[0] * 2
        if not pretrain_controller:
            self.PRETRAIN_EPISODES = 0

        if use_memory:
            print "Decaying meta-controller epsilon faster!"
            self._meta_controller = LstmDqnAgent(num_actions=num_subgoals,
                                                 state_dims=[memory_size],
                                                 sequence_length=memory_size,
                                                 replay_memory_init_size=100,
                                                 target_update=100,
                                                 epsilon_end=0.01,
                                                 epsilon_decay_steps=5000)
        else:
            self._meta_controller = QLearningAgent(num_states=state_sizes[0],
                                                   num_actions=num_subgoals,
                                                   learning_rate=learning_rates[0],
                                                   epsilon=0.1)

        if use_controller_dqn:
            self._controller = ControllerDqnAgent(learning_rate=learning_rates[1],
                                                  num_actions=num_primitive_actions,
                                                  state_dims=state_sizes[1],
                                                  subgoal_dims=[num_subgoals])
        else:
            print "Epsilon end for controller is 0.01!"
            self._controller = DqnAgent(learning_rate=learning_rates[1],
                                        num_actions=num_primitive_actions,
                                        state_dims=[state_sizes[1][0] + num_subgoals],
                                        epsilon_end=0.01)  # CHANGED

        self._subgoals = subgoals
        self._num_subgoals = num_subgoals
        self._meta_controller_state_fn = meta_controller_state_fn
        self._check_subgoal_fn = check_subgoal_fn

        self._use_extra_bit_for_subgoal_center = use_extra_bit_for_subgoal_center
        self._use_controller_dqn = use_controller_dqn
        self._use_intrinsic_timeout = use_intrinsic_timeout
        self._use_memory = use_memory
        self._memory_size = memory_size

        self._meta_controller_state = None
        self._curr_subgoal = None
        self._meta_controller_reward = 0
        self._intermediate_clusters = []
        self._intermediate_dict = defaultdict(int)
        self._intermediate_clusters_dict = defaultdict(int)
        self._history = [0 for i in xrange(self._memory_size)]

        # Only used if use_extra_bit_for_subgoal_center is True.
        self._original_state = None

        self._next_meta_controller_state = None
        self._intrinsic_time_step = 0
        self._episode = 0

    def update_history(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)
        current_cluster_id = np.where(np.squeeze(returned_state) == 1)[0][0] + 1
        new_history = self._history[1:]
        # print "History update!"
        # print self._history
        # print new_history
        # print current_cluster_id
        new_history.append(current_cluster_id)
        # print new_history
        # print ""
        self._history = new_history

    def get_meta_controller_state(self, state):
        returned_state = state
        if self._meta_controller_state_fn:
            returned_state = self._meta_controller_state_fn(state, self._original_state)
        if self._use_memory:
            returned_state = self._history[:]
        return returned_state

    def get_controller_state(self, state, subgoal_index):
        curr_subgoal = self._subgoals[subgoal_index]
        # Concatenate the environment state with the subgoal.
        controller_state = list(state[0])
        for i in xrange(len(curr_subgoal)):
            controller_state.append(curr_subgoal[i])
        controller_state = np.array([controller_state])
        # print controller_state
        return np.copy(controller_state)

    def intrinsic_reward(self, state, subgoal_index):
        if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
            return self.INTRINSIC_TIME_OUT_PENALTY
        if self.subgoal_completed(state, subgoal_index):
            return 1
        else:
            return self.INTRINSIC_STEP_COST

    def subgoal_completed(self, state, subgoal_index):
        if self._check_subgoal_fn is None:
            if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
                return True
            return state == self._subgoals[subgoal_index]
        else:
            if self._use_intrinsic_timeout and self._intrinsic_time_step >= self.INTRINSIC_TIME_OUT:
                return True
            if not self._use_memory and self._meta_controller_state[self._curr_subgoal] == 1:
                if np.sum(self._meta_controller_state) > 1:
                    return False
                return self._check_subgoal_fn(state, subgoal_index, self._original_state)
            else:
                return self._check_subgoal_fn(state, subgoal_index)

    def store(self, state, action, reward, next_state, terminal, eval=False):
        """Stores the current transition in replay memory.

        The transition is stored in the replay memory of the controller. If the transition
        culminates in a subgoal's completion or a terminal state, a transition for the
        meta-controller is constructed and stored in its replay buffer.
        Args:
          state: current state
          action: primitive action taken
          reward: reward received from state-action pair
          next_state: next state
          terminal: extrinsic terminal (True or False)
          eval: Whether the current episode is a train or eval episode.
        """
        self._meta_controller_reward += reward
        self._intrinsic_time_step += 1

        # Compute the controller state, reward, next state, and terminal.
        intrinsic_state = self.get_controller_state(state, self._curr_subgoal)
        intrinsic_next_state = self.get_controller_state(next_state, self._curr_subgoal)
        intrinsic_reward = self.intrinsic_reward(next_state, self._curr_subgoal)
        subgoal_completed = self.subgoal_completed(next_state, self._curr_subgoal)
        intrinsic_terminal = subgoal_completed or terminal

        self._controller.store(np.copy(intrinsic_state), action, intrinsic_reward,
                               np.copy(intrinsic_next_state), intrinsic_terminal, eval)

        # Check for intermediate state.
        intermediate_meta_controller_state = self.get_meta_controller_state(next_state)
        if not self._use_memory:
            intermediate_cluster_id = np.where(
                np.squeeze(intermediate_meta_controller_state) == 1)[0][0]
        else:
            intermediate_cluster_id = intermediate_meta_controller_state[-1] - 1
        self._intermediate_dict[intermediate_cluster_id] += 1

        # Agent is traveling through a cluster that is not the starting or ending cluster.
        # FIX THIS!!!!
        if list(intermediate_meta_controller_state[0:self._num_subgoals]) != list(
                self._meta_controller_state[0:self._num_subgoals]) and not subgoal_completed:
            self._meta_controller_reward += self.EXTRA_TRAVEL_PENALTY
            self._intermediate_clusters.append(intermediate_cluster_id)
            self._intermediate_clusters_dict[intermediate_cluster_id] += 1

        if terminal and not eval:
            self._episode += 1

        if subgoal_completed or terminal:
            # Normalize the meta-controller reward.
            self._meta_controller_reward /= 100.0

            meta_controller_state = np.copy(self._meta_controller_state)
            if not self._use_memory:
                next_meta_controller_state = self.get_meta_controller_state(next_state)
            else:
                returned_state = self._meta_controller_state_fn(next_state, self._original_state)
                current_cluster_id = np.where(np.squeeze(returned_state) == 1)[0][0] + 1
                new_history = self._history[1:]
                new_history.append(current_cluster_id)
                next_meta_controller_state = new_history

            if self._episode >= self.PRETRAIN_EPISODES:
                self._meta_controller.store(np.copy(meta_controller_state), self._curr_subgoal,
                                            self._meta_controller_reward,
                                            np.copy(next_meta_controller_state), terminal, eval,
                                            reward)

            if eval:
                if subgoal_completed:
                    print "Subgoal completed!"
                else:
                    print "Terminal!"
                print "Intermediate clusters:"
                print self._intermediate_clusters
                print "Intermediate cluster count:"
                print self._intermediate_dict
                print "Intermediate non-beginning cluster count:"
                print self._intermediate_clusters_dict
                print "State:"
                print next_state
                print "Meta-Controller reward:"
                print self._meta_controller_reward
                print "Intrinsic reward:"
                print intrinsic_reward
                print "Cluster:"
                print next_meta_controller_state
                print ""
                print ""

            # Reset the current meta-controller state and current subgoal to be None
            # since the current subgoal is finished.
            # Also reset the meta-controller's reward.
            self._next_meta_controller_state = np.copy(next_meta_controller_state)
            if terminal:
                self._next_meta_controller_state = None
            self._meta_controller_state = None
            self._curr_subgoal = None
            self._meta_controller_reward = 0
            self._intermediate_clusters = []
            self._intermediate_dict = defaultdict(int)
            self._intermediate_clusters_dict = defaultdict(int)
            self._original_state = None
            self._intrinsic_time_step = 0
            if terminal:
                self._history = [0 for i in xrange(self._memory_size)]

    def sample(self, state):
        """Samples an action from the hierarchical DQN agent.

        Samples a subgoal if necessary from the meta-controller and samples a primitive action
        from the controller.

        Args:
          state: the current environment state.

        Returns:
          action: a primitive action.
        """
        if self._meta_controller_state is None:
            if self._use_memory:
                self.update_history(state)
            if self._next_meta_controller_state is not None and not self._use_memory:
                self._meta_controller_state = self._next_meta_controller_state
            else:
                self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.sample([self._meta_controller_state])

            # Artificially penalize the meta-controller for picking the subgoal to
            # be the same as the current cluster.
            if self._use_memory:
                same_cluster_instruction = (self._meta_controller_state[-1] - 1) == self._curr_subgoal
            else:
                same_cluster_instruction = self._meta_controller_state[self._curr_subgoal] == 1
            if same_cluster_instruction:
                self._meta_controller_reward = self.ARTIFICIAL_PENALTY
            self._original_state = state

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.sample(controller_state)
        return action

    def best_action(self, state):
        """Returns the greedy action from the hierarchical DQN agent.

        Gets the greedy subgoal if necessary from the meta-controller and gets the greedy
        primitive action from the controller.

        Args:
          state: the current environment state.

        Returns:
          action: the controller's greedy primitive action.
        """
        returned_info = None
        if self._meta_controller_state is None:
            if self._use_memory:
                self.update_history(state)
            if self._next_meta_controller_state is not None and not self._use_memory:
                self._meta_controller_state = self._next_meta_controller_state
            else:
                self._meta_controller_state = self.get_meta_controller_state(state)
            self._curr_subgoal = self._meta_controller.best_action([self._meta_controller_state])
            returned_info = [self._meta_controller_state, self._curr_subgoal]

            # Artificially penalize the meta-controller for picking the subgoal to
            # be the same as the current cluster.
            if self._use_memory:
                same_cluster_instruction = (self._meta_controller_state[-1] - 1) == self._curr_subgoal
            else:
                same_cluster_instruction = self._meta_controller_state[self._curr_subgoal] == 1
            if same_cluster_instruction:
                self._meta_controller_reward = self.ARTIFICIAL_PENALTY
            self._original_state = state

            print "Current State:"
            print state
            print "Current Meta-Controller State:"
            print self._meta_controller_state
            print "Current subgoal picked:"
            print self._curr_subgoal

        controller_state = self.get_controller_state(state, self._curr_subgoal)
        action = self._controller.best_action(controller_state)
        return action, returned_info

    def update(self):
        self._controller.update()
        # Only update the meta-controller right after a meta-controller transition has taken
        # place, which occurs only when either a subgoal has been completed or the agent has
        # reached a terminal state.
        if self._meta_controller_state is None:
            self._meta_controller.update()
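# The cluster bookkeeping above assumes that meta_controller_state_fn maps an environment
# state to a one-hot vector over clusters (the cluster id is recovered with
# np.where(... == 1)). A minimal sketch of such a function, under the illustrative assumption
# that states are 2-D positions in [0, 1)^2 and clusters are the four quadrants:
import numpy as np

def example_meta_controller_state_fn(state, original_state):
    x, y = state[0], state[1]
    cluster_id = int(x >= 0.5) + 2 * int(y >= 0.5)
    cluster_vector = np.zeros(4)
    cluster_vector[cluster_id] = 1
    return cluster_vector

# example_meta_controller_state_fn(np.array([0.7, 0.2]), None) -> array([0., 1., 0., 0.])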