def __init__(self,
             input_dims,
             num_actions,
             learning_rate=2e-4,
             discount_factor=0.99,
             eps=1.0,
             eps_decrement_factor=1e-5,
             eps_min=0.1,
             replay_memory_size=10000,
             mini_batch_size=32):
    """Store the hyper-parameters and build both networks and the replay memory.

    Every argument except ``learning_rate`` is kept on the instance under
    the same name.  ``learning_rate`` is accepted but not used here.
    The online network is created before the target network.
    """
    # Problem description.
    self.input_dims = input_dims
    self.num_actions = num_actions

    # Learning hyper-parameters.
    self.discount_factor = discount_factor
    self.mini_batch_size = mini_batch_size
    self.replay_memory_size = replay_memory_size

    # Epsilon schedule (start value, per-step decrement, floor).
    self.eps = eps
    self.eps_decrement_factor = eps_decrement_factor
    self.eps_min = eps_min

    def build_network(label):
        return DualDeepQCNN(input_dims, num_actions, name=label)

    self.online_network = build_network('OnlineNetwork')
    self.target_network = build_network('TargetNetwork')

    self.memory_bank = AgentMemory(replay_memory_size)
def setUp(self):
    """Build a mock agent, a memory and a dialogue stack holding one BotStackStatus."""
    agent = Mock(["send_chat"])
    memory = AgentMemory()
    stack = DialogueStack(agent, memory)

    # The object under test sits on the stack it is given a reference to.
    status = BotStackStatus(agent=agent, memory=memory, dialogue_stack=stack)
    stack.append(status)

    self.agent = agent
    self.memory = memory
    self.dialogue_stack = stack
class MethodsTests(unittest.TestCase):
    """Tests for the task-stack and mob methods of AgentMemory."""

    MOB_EID = 42

    def setUp(self):
        """Give every test its own AgentMemory."""
        self.memory = AgentMemory()

    @staticmethod
    def _mob_type_for(mob_name):
        """Return the MOBS_BY_ID key whose value is ``mob_name``."""
        ids_by_name = {name: type_id for type_id, name in MOBS_BY_ID.items()}
        return ids_by_name[mob_name]

    def _place_mob(self, mob_type, position):
        """Record a mob with the shared entity id at ``position``."""
        self.memory.set_mob_position(Mob(self.MOB_EID, mob_type, position))

    def test_peek_empty(self):
        """Peeking at an empty task stack yields None."""
        top = self.memory.task_stack_peek()
        self.assertEqual(top, None)

    def test_add_mob(self):
        """A mob can be added, fetched by entity id, and moved."""
        chicken = self._mob_type_for("chicken")

        # First sighting: the mob becomes retrievable.
        self._place_mob(chicken, Pos(3, 4, 5))
        self.assertIsNotNone(self.memory.get_mob_by_eid(self.MOB_EID))

        # Second sighting with the same entity id: the position is updated.
        self._place_mob(chicken, Pos(6, 7, 8))
        node = self.memory.get_mob_by_eid(self.MOB_EID)
        self.assertIsNotNone(node)
        self.assertEqual(node.pos, (6, 7, 8))

    def test_add_guardian_mob(self):
        """Recording a guardian mob must not raise."""
        self._place_mob(self._mob_type_for("guardian"), Pos(3, 4, 5))
class Agent():
    """Base class implementing functionality for different Deep Q Learning methods.

    The constructor stores the hyper-parameters and creates the replay
    memory.  It never assigns ``online_network`` or ``target_network``; the
    methods that use them rely on a subclass having set both attributes.
    """

    def __init__(self, env_name, input_dims, num_actions, learning_rate=2e-4,
                 discount_factor=0.99, eps=1.0, eps_decrement_factor=1e-5,
                 eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        """Store the hyper-parameters and create the replay memory.

        ``learning_rate`` is accepted for interface compatibility but is not
        used by this constructor.
        """
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)
        self.env_name = env_name

    def get_greedy_action(self, observation):
        """Return the greedy action for ``observation``; must be overridden."""
        raise NotImplementedError

    def learn(self):
        """Run one learning step; must be overridden."""
        raise NotImplementedError

    def save_models(self):
        """Checkpoint both the online and the target network."""
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        """Restore both the online and the target network from checkpoints."""
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def store_memory(self, memory):
        """Hand one transition (see ``make_memory``) to the replay memory."""
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        """Pack one transition into a length-5 object array.

        The slots are ``[state, int(action), float(reward), new_state,
        bool(done)]``.  The array is filled slot by slot with
        ``dtype=object`` because NumPy >= 1.24 refuses to build ragged arrays
        implicitly and no longer provides ``np.long``.
        """
        memory = np.empty(5, dtype=object)
        memory[0] = state
        memory[1] = int(action)
        memory[2] = float(reward)
        memory[3] = new_state
        memory[4] = bool(done)
        return memory

    def get_random_action(self, observation):
        """Return a uniformly random action index in ``[0, num_actions)``.

        ``observation`` is unused; it is kept so the signature matches
        ``get_greedy_action``.
        """
        # randint return is inclusive of final value
        return random.randint(0, self.num_actions - 1)

    def decrement_epsilon(self):
        """Lower ``eps`` by one decrement step, never going below ``eps_min``."""
        self.eps = max(self.eps - self.eps_decrement_factor, self.eps_min)

    def sample_memory(self):
        """Return a batch of ``mini_batch_size`` transitions from the replay memory."""
        return self.memory_bank.recall_batch(self.mini_batch_size)

    def copy_online_nn_to_target_nn(self):
        """Overwrite the target network's parameters with the online network's."""
        self.target_network.load_state_dict(self.online_network.state_dict())
def __init__(self,
             env_name,
             input_dims,
             num_actions,
             learning_rate=2e-4,
             discount_factor=0.99,
             eps=1.0,
             eps_decrement_factor=1e-5,
             eps_min=0.1,
             replay_memory_size=10000,
             mini_batch_size=32):
    """Store the hyper-parameters and create the replay memory.

    Every argument except ``learning_rate`` is kept on the instance under
    the same name; ``learning_rate`` is accepted but not used here.
    """
    # Environment / problem description.
    self.env_name = env_name
    self.input_dims = input_dims
    self.num_actions = num_actions

    # Epsilon schedule (start value, per-step decrement, floor).
    self.eps = eps
    self.eps_decrement_factor = eps_decrement_factor
    self.eps_min = eps_min

    # Learning hyper-parameters.
    self.discount_factor = discount_factor
    self.mini_batch_size = mini_batch_size

    # Replay memory sized by replay_memory_size.
    self.replay_memory_size = replay_memory_size
    self.memory_bank = AgentMemory(replay_memory_size)
def __init__(self, env, brain_name, config, policy=None, critic=None):
    """
    Create the controller.

    Parameters
    ----------
    env - Unity environment for the agent to solve
    brain_name, string, brain name used in conjunction with the environment
    config - configuration object; every entry of ``config.as_dict()`` is
        copied onto the instance as an attribute.  Expected entries:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm
            to clip the surrogate
        - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'update_every', int, update frequency, in number of steps
        - 'train_iterations', int, number of training passes over a data batch
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'std', float, standard deviation used for the Normal distribution of the policy
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment
    policy - optional, used to pass a mock policy for testing purposes
    critic - optional, used to pass a mock critic for testing purposes; when
        given, the SAME object serves as trained and target critic
    """
    self.env = env
    self.brain_name = brain_name
    self.__dict__.update(config.as_dict())

    if policy is None:
        policy = Policy(config, self.state_size, self.action_size)
    self.policy = policy

    if critic is None:
        self.trained_critic = Critic(config, self.state_size)
        self.target_critic = Critic(config, self.state_size)
    else:
        self.trained_critic = critic
        self.target_critic = critic
    self.target_critic.eval()

    # One shape per stored field.
    # presumably (state, action, log_probability, next_state, reward, done) - confirm against the caller of memory.add
    per_agent = (self.num_agents, )
    state_shape = (self.num_agents, self.state_size)
    action_shape = (self.num_agents, self.action_size)
    field_shapes = (state_shape, action_shape, per_agent, state_shape,
                    per_agent, per_agent)
    self.memory = AgentMemory(field_shapes, int(self.max_memory))

    self.epsilon = config.epsilon_start
    self.scores = []
    self.surrogates = []

    # A single optimizer drives both the policy and the trained critic.
    parameter_groups = [
        {'params': self.policy.parameters()},
        {'params': self.trained_critic.parameters()},
    ]
    self.optimizer = optim.Adam(parameter_groups, lr=config.learning_rate)
def setUp(self):
    """Create the memory, fake agent and dialogue manager shared by the tests."""
    # Loading the minecraft specs is slow, so the fixture skips it.
    memory = AgentMemory(load_minecraft_specs=False)
    agent = FakeAgent(memory)
    self.memory = memory
    self.agent = agent

    self.dialogue_manager = TtadModelDialogueManager(
        agent, None, None, None, no_ground_truth_actions=True)

    # Fail loudly, with a hint, if a test reaches the line-of-sight call
    # without having configured it through self.set_looking_at().
    hint = ("Cannot call into C++ function in this unit test. "
            "Call self.set_looking_at() to set the return value")
    agent.get_player_line_of_sight = Mock(
        side_effect=NotImplementedError(hint))

    # Sync memory with the fake agent, then use the first other player as
    # the speaker (original note: positioned at (5, 63, 5), looking in +x).
    memory.update(agent)
    known_players = list(memory.other_players.values())
    self.speaker = known_players[0].name
class BotStackStatusTest(unittest.TestCase):
    """Checks that BotStackStatus makes the agent chat about its current task."""

    def setUp(self):
        """Build a mock agent, a memory and a dialogue stack holding one BotStackStatus."""
        agent = Mock(["send_chat"])
        memory = AgentMemory()
        stack = DialogueStack(agent, memory)
        stack.append(
            BotStackStatus(agent=agent, memory=memory, dialogue_stack=stack))

        self.agent = agent
        self.memory = memory
        self.dialogue_stack = stack

    def test_move(self):
        """With a Move task queued, stepping the stack sends a chat."""
        move_task = tasks.Move(self.agent, {"target": (42, 42, 42)})
        self.memory.task_stack_push(move_task)
        self.memory.add_chat("test_agent", "test chat: where are you going?")

        self.dialogue_stack.step()

        self.agent.send_chat.assert_called()
def __init__(self, env, brain_name, config):
    """
    Create the controller.

    Parameters
    ----------
    env - Unity environment for the agent to solve
    brain_name, string, brain name used in conjunction with the environment
    config - configuration object; every entry of ``config.as_dict()`` is
        copied onto the instance as an attribute.  Expected entries:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'batch_size', int, size of the batches sampled to train the model on each update
        - 'update_every', int, update frequency, in number of steps
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment
    """
    self.env = env
    self.brain_name = brain_name
    self.__dict__.update(config.as_dict())

    state_size = self.state_size
    action_size = self.action_size

    # Policies first, then critics; each comes as a trained/target pair.
    self.trained_policy = Policy(config, state_size, action_size)
    self.target_policy = Policy(config, state_size, action_size)
    self.trained_critic = Critic(config, state_size, action_size)
    self.target_critic = Critic(config, state_size, action_size)

    # The target networks are never trained, so keep them in eval mode.
    for frozen in (self.target_policy, self.target_critic):
        frozen.eval()

    # One shape per stored field.
    # presumably (state, action, next_state, reward, done) - confirm against the caller of memory.add
    per_agent = (self.num_agents, )
    state_shape = (self.num_agents, state_size)
    action_shape = (self.num_agents, action_size)
    field_shapes = (state_shape, action_shape, state_shape, per_agent,
                    per_agent)
    self.memory = AgentMemory(field_shapes, int(self.max_memory))

    self.scores = []
    self.critic_losses = []
    self.surrogates = []

    learning_rate = config.learning_rate
    self.critic_optimizer = optim.Adam(self.trained_critic.parameters(),
                                       lr=learning_rate)
    self.policy_optimizer = optim.Adam(self.trained_policy.parameters(),
                                       lr=learning_rate)
def setUp(self):
    """Create a fresh AgentMemory (constructed with no arguments) for each test."""
    self.memory = AgentMemory()
class BaseCraftassistTestCase(unittest.TestCase):
    """Shared fixture and helper methods for craftassist dialogue/task tests."""

    def setUp(self):
        """Create the memory, fake agent and dialogue manager shared by the tests."""
        # Loading the minecraft specs is slow, so the fixture skips it.
        memory = AgentMemory(load_minecraft_specs=False)
        agent = FakeAgent(memory)
        self.memory = memory
        self.agent = agent

        self.dialogue_manager = TtadModelDialogueManager(
            agent, None, None, None, no_ground_truth_actions=True)

        # Fail loudly, with a hint, if a test reaches the line-of-sight call
        # without having configured it through self.set_looking_at().
        hint = ("Cannot call into C++ function in this unit test. "
                "Call self.set_looking_at() to set the return value")
        agent.get_player_line_of_sight = Mock(
            side_effect=NotImplementedError(hint))

        # Sync memory with the fake agent, then use the first other player as
        # the speaker (original note: positioned at (5, 63, 5), looking in +x).
        memory.update(agent)
        known_players = list(memory.other_players.values())
        self.speaker = known_players[0].name

    def handle_action_dict(self,
                           d,
                           answer: str = None,
                           stop_on_chat=False,
                           max_steps=10000) -> Dict[XYZ, IDM]:
        """Handle an action dict, flush, and return the block changes.

        If "answer" is specified and the dialogue stack is still non-empty
        after the first flush (the agent asked something), reply with that
        string and flush again.

        If "stop_on_chat" is specified, stop iterating if the agent says anything
        """
        self.add_incoming_chat("TEST {}".format(d))

        dialogue_object = self.dialogue_manager.handle_action_dict(
            self.speaker, d)
        if dialogue_object is not None:
            self.dialogue_manager.dialogue_stack.append(dialogue_object)

        changes = self.flush(max_steps, stop_on_chat=stop_on_chat)

        nothing_pending = len(self.dialogue_manager.dialogue_stack) == 0
        if nothing_pending or answer is None:
            return changes

        self.add_incoming_chat(answer)
        follow_up = self.flush(max_steps, stop_on_chat=stop_on_chat)
        changes.update(follow_up)
        return changes

    def _stacks_are_empty(self) -> bool:
        """Return True when neither the dialogue stack nor the task stack has work left."""
        if len(self.dialogue_manager.dialogue_stack) != 0:
            return False
        return not self.memory.task_stack_peek()

    def _should_pause(self, stop_on_chat) -> bool:
        """Return True when flush() must stop before the stacks are empty.

        That is the case when the top dialogue object is an unfinished
        AwaitResponse, or when stop_on_chat is set and the agent has said
        something.
        """
        top = self.dialogue_manager.dialogue_stack.peek()
        if isinstance(top, AwaitResponse) and not top.finished:
            return True
        return bool(stop_on_chat and self.agent.get_last_outgoing_chat())

    def flush(self, max_steps=10000, stop_on_chat=False) -> Dict[XYZ, IDM]:
        """Update memory and step the dialogue and task stacks until they are empty

        If "stop_on_chat" is specified, stop iterating if the agent says anything

        Return a dict of the blocks that were changed; blocks that disappeared
        from the world map to (0, 0).
        """
        if stop_on_chat:
            self.agent.clear_outgoing_chats()

        before = self.agent._world.copy()

        for _ in range(max_steps):
            if self._stacks_are_empty():
                break
            self.memory.update(self.agent)
            self.dialogue_manager.dialogue_stack.step()
            self.agent.task_step()
            if self._should_pause(stop_on_chat):
                break

        # One last sync so memory reflects the final world state.
        self.memory.update(self.agent)

        after = self.agent._world.copy()
        new_or_modified = set(after.items()) - set(before.items())
        removed = set(before.keys()) - set(after.keys())

        changes = dict(new_or_modified)
        changes.update({xyz: (0, 0) for xyz in removed})
        return changes

    def set_looking_at(self, xyz: XYZ):
        """Set the return value for C++ call to get_player_line_of_sight"""
        line_of_sight = Mock(return_value=Pos(*xyz))
        self.agent.get_player_line_of_sight = line_of_sight

    def set_blocks(self, xyzbms: List[Block], origin: XYZ = (0, 0, 0)):
        """Change the state of the world, block by block

        Each relative position is shifted by "origin"; memory is notified
        before the agent's world is written.
        """
        for relative_xyz, idm in xyzbms:
            absolute_xyz = tuple(np.array(relative_xyz) + origin)
            self.memory.on_block_changed(absolute_xyz, idm)
            self.agent._world[absolute_xyz] = idm

    def add_object(self, xyzbms: List[Block],
                   origin: XYZ = (0, 0, 0)) -> ObjectNode:
        """Add an object to memory as if it was placed block by block

        Args:
        - xyzbms: a list of relative (xyz, idm)
        - origin: (x, y, z) of the corner

        Returns an ObjectNode, looked up through the first block of xyzbms
        """
        self.set_blocks(xyzbms, origin)

        first_relative_xyz = xyzbms[0][0]
        first_absolute_xyz = tuple(np.array(first_relative_xyz) + origin)
        object_ids = self.memory.get_block_object_ids_by_xyz(
            first_absolute_xyz)
        return self.memory.get_object_by_id(object_ids[0])

    def get_blocks(self, xyzs: Sequence[XYZ]) -> Dict[XYZ, IDM]:
        """Return the ground truth block state"""
        # A degenerate x..x, y..y, z..z range is queried per position and its
        # single cell read back.
        return {
            (x, y, z): tuple(
                self.agent.get_blocks(x, x, y, y, z, z)[0, 0, 0, :])
            for (x, y, z) in xyzs
        }

    def add_incoming_chat(self, chat: str):
        """Add a chat to memory as if it was just spoken by SPEAKER"""
        speaker_memid = self.memory.get_player_by_name(self.speaker).memid
        self.memory.add_chat(speaker_memid, chat)

    def assert_schematics_equal(self, a, b):
        """Check equality between two list[(xyz, idm)] schematics

        N.B. this compares the shapes and idms, but ignores absolute position offsets.
        """
        relative_a, _ = to_relative_pos(a)
        relative_b, _ = to_relative_pos(b)
        self.assertEqual(set(relative_a), set(relative_b))

    def last_outgoing_chat(self) -> str:
        """Return the agent's last outgoing chat."""
        return self.agent.get_last_outgoing_chat()

    def get_speaker_pos(self) -> XYZ:
        """Return the speaker's position as a tuple."""
        speaker_struct = self.memory.get_player_struct_by_name(self.speaker)
        return tuple(pos_to_np(speaker_struct.pos))
class Agent():
    """Deep Q-learning agent holding an online and a target ``DeepQCNN``."""

    def __init__(self, input_dims, num_actions, learning_rate=2e-4,
                 discount_factor=0.99, eps=1.0, eps_decrement_factor=1e-5,
                 eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        """Store the hyper-parameters and build both networks and the replay memory.

        ``learning_rate`` is accepted for interface compatibility but is not
        used by this constructor.
        """
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.online_network = DeepQCNN(
            input_dims, self.num_actions, name='OnlineNetwork')
        self.target_network = DeepQCNN(
            input_dims, self.num_actions, name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        """Lower ``eps`` by one decrement step, never going below ``eps_min``."""
        self.eps = max(self.eps - self.eps_decrement_factor, self.eps_min)

    def store_memory(self, memory):
        """Hand one transition (see ``make_memory``) to the replay memory."""
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        """Pack one transition into a length-5 object array.

        The slots are ``[state, int(action), float(reward), new_state,
        bool(done)]``.  The array is filled slot by slot with
        ``dtype=object`` because NumPy >= 1.24 refuses to build ragged arrays
        implicitly and no longer provides ``np.long``.
        """
        memory = np.empty(5, dtype=object)
        memory[0] = state
        memory[1] = int(action)
        memory[2] = float(reward)
        memory[3] = new_state
        memory[4] = bool(done)
        return memory

    def get_greedy_action(self, observation):
        """Return the index of the highest network output for ``observation``."""
        # convert obs to tensor, pass to device, forward pass, argmax
        obs_t = T.tensor(observation).to(
            self.online_network.device, dtype=T.float)
        # NOTE(review): the tensor is placed on the online network's device
        # but evaluated by the target network - confirm this is intended.
        action = self.target_network.forward(obs_t)
        return action.argmax().item()

    def get_random_action(self, observation):
        """Return a uniformly random action index in ``[0, num_actions)``.

        ``observation`` is unused; it is kept so the signature matches
        ``get_greedy_action``.
        """
        # randint return is inclusive of final value
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        """Placeholder; does nothing."""
        pass

    def save_models(self):
        """Checkpoint both the online and the target network."""
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        """Restore both the online and the target network from checkpoints."""
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        """Placeholder; does nothing."""
        pass

    def copy_online_nn_to_target_nn(self):
        """Overwrite the target network's parameters with the online network's."""
        self.target_network.load_state_dict(self.online_network.state_dict())
def setUp(self):
    """Create the memory, fake agent, dialogue manager and reusable action dicts."""
    # Loading the minecraft specs is slow, so the fixture skips it.
    memory = AgentMemory(load_minecraft_specs=False)
    agent = FakeAgent(memory)
    self.memory = memory
    self.agent = agent

    self.dialogue_manager = TtadModelDialogueManager(
        agent, None, None, None, None, None, no_ground_truth_actions=True
    )

    # Fail loudly, with a hint, if a test reaches the line-of-sight call
    # without having configured it through self.set_looking_at().
    hint = (
        "Cannot call into C++ function in this unit test. "
        "Call self.set_looking_at() to set the return value"
    )
    agent.get_player_line_of_sight = Mock(side_effect=NotImplementedError(hint))

    # Sync memory with the fake agent, then use the first other player as
    # the speaker (original note: positioned at (5, 63, 5), looking in +x).
    memory.update(agent)
    known_players = list(memory.other_players.values())
    self.speaker = known_players[0].name

    # Action dicts that test cases can combine; each entry owns its own
    # nested dicts, so nothing is shared between entries.
    actions = {}
    actions["destroy_speaker_look"] = {
        "action_type": "DESTROY",
        "reference_object": {"location": {"location_type": "SPEAKER_LOOK"}},
    }
    actions["copy_speaker_look_to_agent_pos"] = {
        "action_type": "BUILD",
        "reference_object": {"location": {"location_type": "SPEAKER_LOOK"}},
        "location": {"location_type": "AGENT_POS"},
    }
    actions["build_small_sphere"] = {
        "action_type": "BUILD",
        "schematic": {"has_name": "sphere", "has_size": "small"},
    }
    actions["build_1x1x1_cube"] = {
        "action_type": "BUILD",
        "schematic": {"has_name": "cube", "has_size": "1 x 1 x 1"},
    }
    actions["move_speaker_pos"] = {
        "action_type": "MOVE",
        "location": {"location_type": "SPEAKER_POS"},
    }
    actions["build_diamond"] = {
        "action_type": "BUILD",
        "schematic": {"has_name": "diamond"},
    }
    actions["build_gold_cube"] = {
        "action_type": "BUILD",
        "schematic": {"has_block_type": "gold", "has_name": "cube"},
    }
    actions["fill_all_holes_speaker_look"] = {
        "action_type": "FILL",
        "location": {"location_type": "SPEAKER_LOOK"},
        "repeat": {"repeat_key": "ALL"},
    }
    actions["go_to_tree"] = {
        "action_type": "MOVE",
        "location": {
            "location_type": "REFERENCE_OBJECT",
            "reference_object": {"has_name": "tree"},
        },
    }
    actions["build_square_height_1"] = {
        "action_type": "BUILD",
        "schematic": {"has_name": "square", "has_height": "1"},
    }
    actions["stop"] = {"action_type": "STOP"}
    actions["fill_speaker_look"] = {
        "action_type": "FILL",
        "location": {"location_type": "SPEAKER_LOOK"},
    }
    actions["fill_speaker_look_gold"] = {
        "action_type": "FILL",
        "has_block_type": "gold",
        "location": {"location_type": "SPEAKER_LOOK"},
    }
    self.possible_actions = actions
class DDPGController:
    """
    Deep learning agent based on Deep Deterministic Policy Gradient described in
    https://arxiv.org/pdf/1509.02971.pdf
    """

    def __init__(self, env, brain_name, config):
        """
        Constructor methods to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - configuration object; every entry of ``config.as_dict()`` is
            copied onto the instance as an attribute.  Expected entries:
            - 'num_episodes', int, number of episodes to run the agent for
            - 'gamma', float, discount rate for future rewards
            - 'tau', float, rate for the soft update of the target network
            - 'max_memory', int, size of the replay buffer in number of samples
            - 'batch_size', int, size of the batches sampled to train the model on each update
            - 'update_every', int, update frequency, in number of steps
            - 'mlp_layers', int tuple, shape of the multilayer perceptron model
            - 'learning_rate', float, learning rate for the training of the model
            - 'state_size', int
            - 'action_size', int
            - 'num_agents', int, number of agents running in parallel in the environment
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())

        self.trained_policy = Policy(config, self.state_size, self.action_size)
        self.target_policy = Policy(config, self.state_size, self.action_size)
        self.trained_critic = Critic(config, self.state_size, self.action_size)
        self.target_critic = Critic(config, self.state_size, self.action_size)

        # those networks will never be trained
        self.target_policy.eval()
        self.target_critic.eval()

        # One shape per field stored by solve():
        # (state, action, next_state, reward, done)
        self.memory = AgentMemory(((self.num_agents, self.state_size),
                                   (self.num_agents, self.action_size),
                                   (self.num_agents, self.state_size),
                                   (self.num_agents, ),
                                   (self.num_agents, )),
                                  int(self.max_memory))

        self.scores = []
        self.critic_losses = []
        self.surrogates = []

        self.critic_optimizer = optim.Adam(self.trained_critic.parameters(),
                                           lr=config.learning_rate)
        self.policy_optimizer = optim.Adam(self.trained_policy.parameters(),
                                           lr=config.learning_rate)

    @staticmethod
    def _mean_or_nan(values):
        """Return the mean of ``values``, or NaN when the list is empty.

        ``np.mean([])`` also yields NaN but emits a RuntimeWarning; an episode
        in which no training step ran is expected, so it is handled quietly.
        """
        return np.mean(values) if values else np.nan

    def solve(self):
        """
        Main method to launch the environment loop

        Return
        ---------
        (scores, surrogates, critic_losses), one entry per episode.  The last
        two are NaN for an episode in which no training step ran.
        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            critic_losses = []

            while True:
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done

                self.memory.add((state, action, next_state, reward, done))
                state = next_state
                rewards.append(reward)

                # Train every `update_every` steps once a full batch is available.
                if self.memory.size >= self.batch_size and not step % self.update_every:
                    surrogate_buffer, critic_loss = self.train()
                    surrogates.append(surrogate_buffer)
                    critic_losses.append(critic_loss)

                step += 1

                # The episode ends as soon as any agent reports done.
                if np.any(done):
                    break

            # Sum rewards over time per agent, then average over agents.
            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(self._mean_or_nan(surrogates))
            self.critic_losses.append(self._mean_or_nan(critic_losses))

            self.print_status(i_episode)

        return self.scores, self.surrogates, self.critic_losses

    def act(self, states):
        """
        Based on states, returns the on-policy actions

        Parameter
        ---------
        states - float array shape=(num_agents, state_size)

        Return
        ---------
        Float array shape=(num_agents, action_size), chosen action
        """
        states = torch.from_numpy(states).float().to(device)
        self.trained_policy.eval()
        with torch.no_grad():
            actions = self.trained_policy(states)
        # TODO: add exploration noise
        return actions.cpu().data.numpy()

    def train(self):
        """
        Training routine to update the policy and critic

        Return
        ---------
        (surrogate, critic_loss) as numpy values
        """
        states, actions, next_states, rewards, dones = self.memory.sample(
            self.batch_size)

        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        rewards = torch.from_numpy(rewards).float().to(device)
        dones = torch.from_numpy(dones).float().to(device)

        # critic update
        # The TD target only involves the frozen target networks, so it is
        # built without autograd: the trained critic's gradient is unchanged
        # and no backward pass is wasted on networks that are never optimized.
        # NOTE(review): rewards/dones are (batch, num_agents); confirm the
        # critic output broadcasts against them as intended.
        with torch.no_grad():
            next_actions = self.target_policy(next_states)
            done_mask = 1 - dones
            target_states_values = rewards + self.gamma * \
                self.target_critic(next_states, next_actions) * done_mask

        self.trained_critic.train()
        self.critic_optimizer.zero_grad()
        predicted_states_values = self.trained_critic(states, actions)
        critic_loss = torch.mean(
            (target_states_values - predicted_states_values)**2)
        critic_loss.backward()
        self.critic_optimizer.step()

        # policy update: maximise the critic's value of the policy's actions
        self.trained_policy.train()
        self.policy_optimizer.zero_grad()
        action_values = self.trained_critic(states,
                                            self.trained_policy(states))
        surrogate = -torch.mean(action_values)
        surrogate.backward()
        self.policy_optimizer.step()

        self.target_network_update(self.trained_critic, self.target_critic)
        self.target_network_update(self.trained_policy, self.target_policy)

        return surrogate.cpu().data.numpy(), critic_loss.cpu().data.numpy()

    def target_network_update(self, trained_model, target_model):
        """
        Performs a soft update with rate tau from the trained_model to the
        target_model: new = (1 - tau) * target + tau * trained.
        """
        new_weights = [
            target_w * (1 - self.tau) + trained_w * self.tau
            for target_w, trained_w in zip(target_model.get_weights(),
                                           trained_model.get_weights())
        ]
        target_model.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int
        """
        # "\r" and end="" rewrite the same console line on every episode.
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Surrogate: %.5f | Critic loss: %.5f  "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1], self.critic_losses[-1]),
            end="",
            flush=True)
class Agent():
    """Deep Q-learning agent holding an online and a target ``DualDeepQCNN``."""

    def __init__(self, input_dims, num_actions, learning_rate=2e-4,
                 discount_factor=0.99, eps=1.0, eps_decrement_factor=1e-5,
                 eps_min=0.1, replay_memory_size=10000, mini_batch_size=32):
        """Store the hyper-parameters and build both networks and the replay memory.

        ``learning_rate`` is accepted for interface compatibility but is not
        used by this constructor.
        """
        self.input_dims = input_dims
        self.num_actions = num_actions
        self.discount_factor = discount_factor
        self.eps_min = eps_min
        self.eps = eps
        self.eps_decrement_factor = eps_decrement_factor
        self.mini_batch_size = mini_batch_size
        self.online_network = DualDeepQCNN(input_dims,
                                           self.num_actions,
                                           name='OnlineNetwork')
        self.target_network = DualDeepQCNN(input_dims,
                                           self.num_actions,
                                           name='TargetNetwork')
        self.replay_memory_size = replay_memory_size
        self.memory_bank = AgentMemory(self.replay_memory_size)

    def decrement_epsilon(self):
        """Lower ``eps`` by one decrement step, never going below ``eps_min``."""
        self.eps = max(self.eps - self.eps_decrement_factor, self.eps_min)

    def store_memory(self, memory):
        """Hand one transition (see ``make_memory``) to the replay memory."""
        self.memory_bank.remember(memory)

    def make_memory(self, state, action, reward, new_state, done):
        """Pack one transition into a length-5 object array.

        The slots are ``[state, int(action), float(reward), new_state,
        bool(done)]``.  The array is filled slot by slot with
        ``dtype=object`` because NumPy >= 1.24 refuses to build ragged arrays
        implicitly and no longer provides ``np.long``.
        """
        memory = np.empty(5, dtype=object)
        memory[0] = state
        memory[1] = int(action)
        memory[2] = float(reward)
        memory[3] = new_state
        memory[4] = bool(done)
        return memory

    def get_greedy_action(self, observation):
        """Return the index of the largest advantage output for ``observation``."""
        # convert obs to tensor, pass to device, forward pass, argmax
        obs_t = T.tensor(observation).to(self.online_network.device,
                                         dtype=T.float)
        # The state value and the mean advantage shift every action equally,
        # so the argmax over the advantage stream alone gives the same ordering.
        # NOTE(review): the tensor is placed on the online network's device
        # but evaluated by the target network - confirm this is intended.
        action_v, action_a = self.target_network.forward(obs_t)
        return action_a.argmax().item()

    def get_random_action(self, observation):
        """Return a uniformly random action index in ``[0, num_actions)``.

        ``observation`` is unused; it is kept so the signature matches
        ``get_greedy_action``.
        """
        # randint return is inclusive of final value
        return random.randint(0, self.num_actions - 1)

    def train_online_network(self):
        """Placeholder; does nothing."""
        pass

    def save_models(self):
        """Checkpoint both the online and the target network."""
        self.online_network.save_checkpoint()
        self.target_network.save_checkpoint()

    def load_models(self):
        """Restore both the online and the target network from checkpoints."""
        self.online_network.load_checkpoint()
        self.target_network.load_checkpoint()

    def update_target_network(self):
        """Placeholder; does nothing."""
        pass

    def copy_online_nn_to_target_nn(self):
        """Overwrite the target network's parameters with the online network's."""
        self.target_network.load_state_dict(self.online_network.state_dict())
class PPOController:
    """
    Deep learning agent based on Proximal Policy Optimization
    (https://arxiv.org/abs/1707.06347); see also
    https://arxiv.org/pdf/1506.02438.pdf (generalized advantage estimation).
    """

    def __init__(self, env, brain_name, config, policy=None, critic=None):
        """
        Constructor methods to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - configuration object; every entry of ``config.as_dict()`` is
            copied onto the instance as an attribute.  Expected entries:
            - 'num_episodes', int, number of episodes to run the agent for
            - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm
                to clip the surrogate
            - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
            - 'gamma', float, discount rate for future rewards
            - 'tau', float, rate for the soft update of the target network
            - 'max_memory', int, size of the replay buffer in number of samples
            - 'update_every', int, update frequency, in number of steps
            - 'train_iterations', int, number of training passes over a data batch
            - 'mlp_layers', int tuple, shape of the multilayer perceptron model
            - 'learning_rate', float, learning rate for the training of the model
            - 'std', float, standard deviation used for the Normal distribution of the policy
            - 'state_size', int
            - 'action_size', int
            - 'num_agents', int, number of agents running in parallel in the environment
        policy - optional, used to pass a mock policy for testing purposes
        critic - optional, used to pass a mock critic for testing purposes; when
            given, the SAME object serves as trained and target critic
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())

        self.policy = Policy(config, self.state_size,
                             self.action_size) if policy is None else policy
        self.trained_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic.eval()

        # One shape per field stored by solve():
        # (state, action, log_probability, next_state, reward, done)
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size),
             (self.num_agents, ),
             (self.num_agents, self.state_size),
             (self.num_agents, ),
             (self.num_agents, )),
            int(self.max_memory))

        self.epsilon = config.epsilon_start
        self.scores = []
        self.surrogates = []

        # A single optimizer drives both the policy and the trained critic.
        self.optimizer = optim.Adam([{
            'params': self.policy.parameters()
        }, {
            'params': self.trained_critic.parameters()
        }],
                                    lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        Return
        ---------
        (scores, surrogates), one entry per episode.  The surrogate entry is
        NaN for an episode in which no training step ran.
        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []

            while True:
                action, log_probability = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done

                self.memory.add(
                    (state, action, log_probability, next_state, reward, done))
                state = next_state
                rewards.append(reward)

                if not step % self.update_every:
                    surrogate_buffer = self.train_loop()
                    surrogates.append(surrogate_buffer)

                step += 1

                # The episode ends as soon as any agent reports done.
                if np.any(done):
                    break

            # Sum rewards over time per agent, then average over agents.
            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            # np.mean([]) would emit a RuntimeWarning; an episode without any
            # training step is expected, so record NaN quietly instead.
            self.surrogates.append(
                np.mean(surrogates) if surrogates else np.nan)
            self.epsilon *= self.epsilon_decay

            self.print_status(i_episode)

        return self.scores, self.surrogates

    def act(self, states):
        """
        Based on states, returns the on-policy actions

        Parameter
        ---------
        states - float array shape=(num_agents, state_size)

        Return
        ---------
        (actions, log_probabilities) as numpy arrays; actions has
        shape=(num_agents, action_size)
        """
        states = torch.from_numpy(states).float().to(device)
        self.policy.eval()
        actions, log_probabilities = self.policy.next_actions(states)
        return actions.cpu().data.numpy(), log_probabilities.cpu().data.numpy()

    def train_loop(self):
        """
        Training routine to update the policy and critic

        Runs ``train_iterations`` optimisation passes over the latest
        ``update_every`` transitions and returns the surrogate of each pass.
        """
        surrogate_buffer = []

        states, actions, old_log_probabilities, next_states, rewards, dones = \
            self.memory.get_latest(self.update_every)

        future_rewards = self.compute_discounted_future_rewards(rewards)

        old_log_probabilities = torch.from_numpy(
            old_log_probabilities).float().to(device)
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        future_rewards = torch.from_numpy(future_rewards).float().to(device)
        dones = torch.from_numpy(dones).bool().to(device)

        self.policy.train()
        self.trained_critic.train()

        for _ in range(self.train_iterations):
            surrogate = self.compute_surrogate(old_log_probabilities, states,
                                               actions, next_states,
                                               future_rewards, dones)
            surrogate_buffer.append(surrogate.cpu().data.numpy())
            self.optimizer.zero_grad()
            surrogate.backward()
            self.optimizer.step()

        self.target_network_update()

        return surrogate_buffer

    def compute_surrogate(self, old_log_probabilities, states, actions,
                          next_states, future_rewards, dones):
        """
        Compute the surrogate, i.e. the function optimized at training time

        Parameters
        ----------
        - old_log_probabilities, float Tensor shape=(batch_size, num_agents),
            original probabilities for the performed action
        - states, float Tensor shape=(batch_size, num_agents, state_size)
        - actions, float Tensor shape=(batch_size, num_agents, action_size)
        - next_states, float Tensor shape=(batch_size, num_agents, state_size)
        - future_rewards, float Tensor shape=(batch_size, num_agents),
            discounted sum of future rewards over the length of the trajectory
        - dones, bool Tensor shape=(batch_size, num_agents)

        Return
        ---------
        Surrogate, float Tensor
        """
        new_log_probabilities, entropy = \
            self.policy.get_log_probabilities_and_entropy(states, actions)

        ratio = torch.exp(new_log_probabilities - old_log_probabilities)

        with torch.no_grad():
            states_values = self.target_critic(states)
            next_states_values = self.target_critic(next_states[-1, :])

        # Bootstrap from the value of the last next-state unless any
        # transition in the batch is terminal.
        if torch.any(dones):
            final_states_values = 0
        else:
            final_states_values = next_states_values.expand(
                states_values.shape)

        future_rewards = self.normalize(future_rewards)

        # gamma^(T - i) for step i; created on the critic's device so the
        # product below also works when the model lives on an accelerator.
        discount = self.gamma**torch.arange(
            len(states_values), 0, -1,
            dtype=torch.float,
            device=states_values.device).unsqueeze(1)

        target_states_values = future_rewards + final_states_values * discount
        advantages = target_states_values - states_values

        clip = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        clipped_surrogate = torch.min(ratio * advantages, clip * advantages)

        # NOTE(review): both arguments of mse() were computed without
        # autograd from the target critic - confirm this term actually
        # trains trained_critic.
        return -1 * torch.mean(
            clipped_surrogate) + 0.5 * self.trained_critic.mse(
                states_values, target_states_values) - 0.01 * entropy.mean()

    def normalize(self, a):
        """
        Normalize a torch Tensor along its last dimension

        The input is left untouched; a normalized copy is returned.

        Parameters
        ----------
        - a, float Tensor to normalize
        """
        mean = torch.mean(a, -1)
        std = torch.std(a, -1)

        # Work on a copy: train_loop passes the same tensor on every
        # iteration and must not see it modified.
        b = a.clone()

        mask = std != 0
        b[mask] = (a[mask] - mean[mask].unsqueeze(1)) / std[mask].unsqueeze(1)

        # if the deviation is null set the normalized reward to 0
        mask = std == 0
        b[mask] = 0

        return b

    def compute_discounted_future_rewards(self, rewards):
        """
        Compute the discounted sum of future reward over the trajectory

        Row i of the result is sum over j >= i of gamma^(j - i) * rewards[j].

        Parameters
        ----------
        - rewards, float array shape=(batch_size, num_agents)

        Return
        ----------
        Discounted future rewards, float array shape=(batch_size, num_agents)
        """
        # Example with gamma = 0.5 and
        # rewards = [[1, 0],
        #            [1, 1]]
        main_dim = len(rewards)
        steps = np.arange(main_dim)

        # exponents[i, j] = j - i, i.e. how many steps after step i the
        # reward j is received
        # exponents = [[ 0, 1],
        #              [-1, 0]]
        exponents = steps[np.newaxis, :] - steps[:, np.newaxis]

        # Past rewards (j < i) are dropped by triu; their exponent is clamped
        # to 0 first so no negative power is ever evaluated.
        # discounts = [[1, 0.5],
        #              [0, 1]]
        discounts = np.triu(self.gamma**np.maximum(exponents, 0))

        # result = [[1.5, 0.5],
        #           [1,   1]]
        return np.dot(discounts, rewards)

    def target_network_update(self):
        """
        Performs a soft update with rate tau from the trained critic to the
        target critic: new = (1 - tau) * target + tau * trained.
        """
        new_weights = [
            target_w * (1 - self.tau) + trained_w * self.tau
            for target_w, trained_w in zip(self.target_critic.get_weights(),
                                           self.trained_critic.get_weights())
        ]
        self.target_critic.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int
        """
        # "\r" and end="" rewrite the same console line on every episode.
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Model surrogate: %.5f   "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1]),
            end="",
            flush=True)