import numpy as np

from mlagents.trainers.buffer import AgentBufferField


def test_agentbufferfield():
    # Test constructor
    a = AgentBufferField([0, 1, 2])
    for i, num in enumerate(a):
        assert num == i
        # Test indexing
        assert a[i] == num

    # Test slicing
    b = a[1:3]
    assert b == [1, 2]
    assert isinstance(b, AgentBufferField)

    # Test padding
    c = AgentBufferField()
    for _ in range(2):
        c.append([np.array(1), np.array(2)])

    for _ in range(2):
        c.append([np.array(1)])

    padded = c.padded_to_batch(pad_value=3)
    assert np.array_equal(padded[0], np.array([1, 1, 1, 1]))
    assert np.array_equal(padded[1], np.array([2, 2, 3, 3]))

    # Make sure it doesn't fail when the field isn't a list
    padded_a = a.padded_to_batch()
    assert np.array_equal(padded_a, a)
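# The padding behavior asserted above can be reproduced with a small standalone
# sketch. This is NOT the library's implementation of AgentBufferField.padded_to_batch;
# it is a hypothetical stand-in in plain NumPy that mirrors what the test expects:
# position i across all appended entries becomes one array, and entries that are
# too short are filled with pad_value.
def padded_to_batch_sketch(field, pad_value=0):
    longest = max(len(entry) for entry in field)
    batch = []
    for i in range(longest):
        # Collect element i from every entry, padding missing positions.
        column = [entry[i] if i < len(entry) else pad_value for entry in field]
        batch.append(np.array(column))
    return batch


# Mirrors the assertions above: two entries of [1, 2] and two of [1].
_sketch = padded_to_batch_sketch([[1, 2], [1, 2], [1], [1]], pad_value=3)
assert np.array_equal(_sketch[0], np.array([1, 1, 1, 1]))
assert np.array_equal(_sketch[1], np.array([2, 2, 3, 3]))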
def _evaluate_by_sequence(
    self, tensor_obs: List[torch.Tensor], initial_memory: np.ndarray
) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]:
    """
    Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
    intermediate memories for the critic.
    :param tensor_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's
        observations for this trajectory.
    :param initial_memory: The memory that precedes this trajectory. Of shape (1,1,<mem_size>), i.e.
        what is returned as the output of a MemoryModule.
    :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial
        memories to be used during value function update, and the final memory at the end of the trajectory.
    """
    num_experiences = tensor_obs[0].shape[0]
    all_next_memories = AgentBufferField()
    # In the buffer, the 1st sequence is the one that is padded. So if seq_len = 3 and
    # the trajectory is of length 10, the 1st sequence is [pad,pad,obs].
    # Compute the number of elements in this padded seq.
    leftover = num_experiences % self.policy.sequence_length

    # Compute values for the potentially truncated initial sequence
    seq_obs = []
    first_seq_len = self.policy.sequence_length
    for _obs in tensor_obs:
        if leftover > 0:
            first_seq_len = leftover
        first_seq_obs = _obs[0:first_seq_len]
        seq_obs.append(first_seq_obs)

    # For the first sequence, the initial memory should be the one at the
    # beginning of this trajectory.
    for _ in range(first_seq_len):
        all_next_memories.append(ModelUtils.to_numpy(initial_memory.squeeze()))

    init_values, _mem = self.critic.critic_pass(
        seq_obs, initial_memory, sequence_length=first_seq_len
    )
    all_values = {
        signal_name: [init_values[signal_name]]
        for signal_name in init_values.keys()
    }

    # Evaluate the remaining sequences, carrying over _mem after each one
    for seq_num in range(
        1, math.ceil(num_experiences / self.policy.sequence_length)
    ):
        seq_obs = []
        for _ in range(self.policy.sequence_length):
            all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
        for _obs in tensor_obs:
            start = seq_num * self.policy.sequence_length - (
                self.policy.sequence_length - leftover
            )
            end = (seq_num + 1) * self.policy.sequence_length - (
                self.policy.sequence_length - leftover
            )
            seq_obs.append(_obs[start:end])
        values, _mem = self.critic.critic_pass(
            seq_obs, _mem, sequence_length=self.policy.sequence_length
        )
        for signal_name, _val in values.items():
            all_values[signal_name].append(_val)

    # Create one tensor per reward signal
    all_value_tensors = {
        signal_name: torch.cat(value_list, dim=0)
        for signal_name, value_list in all_values.items()
    }
    next_mem = _mem
    return all_value_tensors, all_next_memories, next_mem
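# A worked example of the slicing arithmetic above, under the "first sequence is
# padded" scheme. The helper below exists only for illustration and the numbers are
# hypothetical (sequence_length = 3, trajectory of 10 experiences, so leftover = 1):
# the truncated first sequence covers [0:1], and the remaining full sequences cover
# [1:4], [4:7] and [7:10].
import math


def _first_padded_slices(num_experiences, sequence_length):
    leftover = num_experiences % sequence_length
    first_seq_len = leftover if leftover > 0 else sequence_length
    slices = [(0, first_seq_len)]
    for seq_num in range(1, math.ceil(num_experiences / sequence_length)):
        start = seq_num * sequence_length - (sequence_length - leftover)
        end = (seq_num + 1) * sequence_length - (sequence_length - leftover)
        slices.append((start, end))
    return slices


assert _first_padded_slices(10, 3) == [(0, 1), (1, 4), (4, 7), (7, 10)]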
def _evaluate_by_sequence_team(
    self,
    self_obs: List[torch.Tensor],
    obs: List[List[torch.Tensor]],
    actions: List[AgentAction],
    init_value_mem: torch.Tensor,
    init_baseline_mem: torch.Tensor,
) -> Tuple[
    Dict[str, torch.Tensor],
    Dict[str, torch.Tensor],
    AgentBufferField,
    AgentBufferField,
    torch.Tensor,
    torch.Tensor,
]:
    """
    Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
    intermediate memories for the critic.
    :param self_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are this agent's
        observations for this trajectory.
    :param obs: A List (one entry per groupmate) of Lists of observation tensors for this trajectory.
    :param actions: A List of AgentActions, one per groupmate, for this trajectory.
    :param init_value_mem: The value memory that precedes this trajectory. Of shape (1,1,<mem_size>),
        i.e. what is returned as the output of a MemoryModule.
    :param init_baseline_mem: The baseline memory that precedes this trajectory, of the same shape.
    :return: A Tuple of the value and baseline estimates as Dicts of [name, tensor], AgentBufferFields
        of the initial value and baseline memories to be used during the value function update, and
        the final value and baseline memories at the end of the trajectory.
    """
    num_experiences = self_obs[0].shape[0]
    all_next_value_mem = AgentBufferField()
    all_next_baseline_mem = AgentBufferField()
    # In the buffer, the 1st sequence is the one that is padded. So if seq_len = 3 and
    # the trajectory is of length 10, the 1st sequence is [pad,pad,obs].
    # Compute the number of elements in this padded seq.
    leftover = num_experiences % self.policy.sequence_length

    # Compute values for the potentially truncated initial sequence
    first_seq_len = leftover if leftover > 0 else self.policy.sequence_length

    self_seq_obs = []
    groupmate_seq_obs = []
    groupmate_seq_act = []
    seq_obs = []
    for _self_obs in self_obs:
        first_seq_obs = _self_obs[0:first_seq_len]
        seq_obs.append(first_seq_obs)
    self_seq_obs.append(seq_obs)

    for groupmate_obs, groupmate_action in zip(obs, actions):
        seq_obs = []
        for _obs in groupmate_obs:
            first_seq_obs = _obs[0:first_seq_len]
            seq_obs.append(first_seq_obs)
        groupmate_seq_obs.append(seq_obs)
        _act = groupmate_action.slice(0, first_seq_len)
        groupmate_seq_act.append(_act)

    # For the first sequence, the initial memory should be the one at the
    # beginning of this trajectory.
    for _ in range(first_seq_len):
        all_next_value_mem.append(ModelUtils.to_numpy(init_value_mem.squeeze()))
        all_next_baseline_mem.append(ModelUtils.to_numpy(init_baseline_mem.squeeze()))

    all_seq_obs = self_seq_obs + groupmate_seq_obs
    init_values, _value_mem = self.critic.critic_pass(
        all_seq_obs, init_value_mem, sequence_length=first_seq_len
    )
    all_values = {
        signal_name: [init_values[signal_name]]
        for signal_name in init_values.keys()
    }

    groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
    init_baseline, _baseline_mem = self.critic.baseline(
        self_seq_obs[0],
        groupmate_obs_and_actions,
        init_baseline_mem,
        sequence_length=first_seq_len,
    )
    all_baseline = {
        signal_name: [init_baseline[signal_name]]
        for signal_name in init_baseline.keys()
    }

    # Evaluate the remaining sequences, carrying over the memories after each one
    for seq_num in range(
        1, math.ceil(num_experiences / self.policy.sequence_length)
    ):
        for _ in range(self.policy.sequence_length):
            all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
            all_next_baseline_mem.append(ModelUtils.to_numpy(_baseline_mem.squeeze()))

        start = seq_num * self.policy.sequence_length - (
            self.policy.sequence_length - leftover
        )
        end = (seq_num + 1) * self.policy.sequence_length - (
            self.policy.sequence_length - leftover
        )

        self_seq_obs = []
        groupmate_seq_obs = []
        groupmate_seq_act = []
        seq_obs = []
        for _self_obs in self_obs:
            seq_obs.append(_self_obs[start:end])
        self_seq_obs.append(seq_obs)

        for groupmate_obs, groupmate_action in zip(obs, actions):
            seq_obs = []
            for _obs in groupmate_obs:
                seq_obs.append(_obs[start:end])
            groupmate_seq_obs.append(seq_obs)
            _act = groupmate_action.slice(start, end)
            groupmate_seq_act.append(_act)

        all_seq_obs = self_seq_obs + groupmate_seq_obs
        values, _value_mem = self.critic.critic_pass(
            all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
        )
        for signal_name, _val in values.items():
            all_values[signal_name].append(_val)

        groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
        baselines, _baseline_mem = self.critic.baseline(
            self_seq_obs[0],
            groupmate_obs_and_actions,
            _baseline_mem,
            sequence_length=self.policy.sequence_length,
        )
        for signal_name, _val in baselines.items():
            all_baseline[signal_name].append(_val)

    # Create one tensor per reward signal
    all_value_tensors = {
        signal_name: torch.cat(value_list, dim=0)
        for signal_name, value_list in all_values.items()
    }
    all_baseline_tensors = {
        signal_name: torch.cat(baseline_list, dim=0)
        for signal_name, baseline_list in all_baseline.items()
    }
    next_value_mem = _value_mem
    next_baseline_mem = _baseline_mem
    return (
        all_value_tensors,
        all_baseline_tensors,
        all_next_value_mem,
        all_next_baseline_mem,
        next_value_mem,
        next_baseline_mem,
    )
def _evaluate_by_sequence_team(
    self,
    self_obs: List[torch.Tensor],
    obs: List[List[torch.Tensor]],
    actions: List[AgentAction],
    init_value_mem: torch.Tensor,
    init_baseline_mem: torch.Tensor,
) -> Tuple[
    Dict[str, torch.Tensor],
    Dict[str, torch.Tensor],
    AgentBufferField,
    AgentBufferField,
    torch.Tensor,
    torch.Tensor,
]:
    """
    Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
    intermediate memories for the critic.
    :param self_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are this agent's
        observations for this trajectory.
    :param obs: A List (one entry per groupmate) of Lists of observation tensors for this trajectory.
    :param actions: A List of AgentActions, one per groupmate, for this trajectory.
    :param init_value_mem: The value memory that precedes this trajectory. Of shape (1,1,<mem_size>),
        i.e. what is returned as the output of a MemoryModule.
    :param init_baseline_mem: The baseline memory that precedes this trajectory, of the same shape.
    :return: A Tuple of the value and baseline estimates as Dicts of [name, tensor], AgentBufferFields
        of the initial value and baseline memories to be used during the value function update, and
        the final value and baseline memories at the end of the trajectory.
    """
    num_experiences = self_obs[0].shape[0]
    all_next_value_mem = AgentBufferField()
    all_next_baseline_mem = AgentBufferField()

    # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes,
    # that division isn't even, and we must pad the leftover sequence.
    # In the buffer, the last sequence is the one that is padded. So if seq_len = 3 and
    # the trajectory is of length 10, the last sequence is [obs,pad,pad].
    # Compute the number of elements in this padded seq.
    leftover_seq_len = num_experiences % self.policy.sequence_length

    all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
    all_baseline: Dict[str, List[np.ndarray]] = defaultdict(list)
    _baseline_mem = init_baseline_mem
    _value_mem = init_value_mem

    # Evaluate the full-length sequences first, carrying over the memories after each one
    for seq_num in range(num_experiences // self.policy.sequence_length):
        for _ in range(self.policy.sequence_length):
            all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
            all_next_baseline_mem.append(ModelUtils.to_numpy(_baseline_mem.squeeze()))

        start = seq_num * self.policy.sequence_length
        end = (seq_num + 1) * self.policy.sequence_length

        self_seq_obs = []
        groupmate_seq_obs = []
        groupmate_seq_act = []
        seq_obs = []
        for _self_obs in self_obs:
            seq_obs.append(_self_obs[start:end])
        self_seq_obs.append(seq_obs)

        for groupmate_obs, groupmate_action in zip(obs, actions):
            seq_obs = []
            for _obs in groupmate_obs:
                sliced_seq_obs = _obs[start:end]
                seq_obs.append(sliced_seq_obs)
            groupmate_seq_obs.append(seq_obs)
            _act = groupmate_action.slice(start, end)
            groupmate_seq_act.append(_act)

        all_seq_obs = self_seq_obs + groupmate_seq_obs
        values, _value_mem = self.critic.critic_pass(
            all_seq_obs, _value_mem, sequence_length=self.policy.sequence_length
        )
        for signal_name, _val in values.items():
            all_values[signal_name].append(_val)

        groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
        baselines, _baseline_mem = self.critic.baseline(
            self_seq_obs[0],
            groupmate_obs_and_actions,
            _baseline_mem,
            sequence_length=self.policy.sequence_length,
        )
        for signal_name, _val in baselines.items():
            all_baseline[signal_name].append(_val)

    # Compute values for the potentially truncated last sequence
    if leftover_seq_len > 0:
        self_seq_obs = []
        groupmate_seq_obs = []
        groupmate_seq_act = []
        seq_obs = []
        for _self_obs in self_obs:
            last_seq_obs = _self_obs[-leftover_seq_len:]
            seq_obs.append(last_seq_obs)
        self_seq_obs.append(seq_obs)

        for groupmate_obs, groupmate_action in zip(obs, actions):
            seq_obs = []
            for _obs in groupmate_obs:
                last_seq_obs = _obs[-leftover_seq_len:]
                seq_obs.append(last_seq_obs)
            groupmate_seq_obs.append(seq_obs)
            _act = groupmate_action.slice(len(_obs) - leftover_seq_len, len(_obs))
            groupmate_seq_act.append(_act)

        # For the last sequence, the initial memory should be the one carried
        # over from the previous sequence of this trajectory.
        for _ in range(leftover_seq_len):
            all_next_value_mem.append(ModelUtils.to_numpy(_value_mem.squeeze()))
            all_next_baseline_mem.append(ModelUtils.to_numpy(_baseline_mem.squeeze()))

        all_seq_obs = self_seq_obs + groupmate_seq_obs
        last_values, _value_mem = self.critic.critic_pass(
            all_seq_obs, _value_mem, sequence_length=leftover_seq_len
        )
        for signal_name, _val in last_values.items():
            all_values[signal_name].append(_val)

        groupmate_obs_and_actions = (groupmate_seq_obs, groupmate_seq_act)
        last_baseline, _baseline_mem = self.critic.baseline(
            self_seq_obs[0],
            groupmate_obs_and_actions,
            _baseline_mem,
            sequence_length=leftover_seq_len,
        )
        for signal_name, _val in last_baseline.items():
            all_baseline[signal_name].append(_val)

    # Create one tensor per reward signal
    all_value_tensors = {
        signal_name: torch.cat(value_list, dim=0)
        for signal_name, value_list in all_values.items()
    }
    all_baseline_tensors = {
        signal_name: torch.cat(baseline_list, dim=0)
        for signal_name, baseline_list in all_baseline.items()
    }
    next_value_mem = _value_mem
    next_baseline_mem = _baseline_mem
    return (
        all_value_tensors,
        all_baseline_tensors,
        all_next_value_mem,
        all_next_baseline_mem,
        next_value_mem,
        next_baseline_mem,
    )
def _evaluate_by_sequence(
    self, tensor_obs: List[torch.Tensor], initial_memory: torch.Tensor
) -> Tuple[Dict[str, torch.Tensor], AgentBufferField, torch.Tensor]:
    """
    Evaluate a trajectory sequence-by-sequence, assembling the result. This enables us to get the
    intermediate memories for the critic.
    :param tensor_obs: A List of tensors of shape (trajectory_len, <obs_dim>) that are the agent's
        observations for this trajectory.
    :param initial_memory: The memory that precedes this trajectory. Of shape (1,1,<mem_size>), i.e.
        what is returned as the output of a MemoryModule.
    :return: A Tuple of the value estimates as a Dict of [name, tensor], an AgentBufferField of the initial
        memories to be used during value function update, and the final memory at the end of the trajectory.
    """
    num_experiences = tensor_obs[0].shape[0]
    all_next_memories = AgentBufferField()

    # When using LSTM, we need to divide the trajectory into sequences of equal length. Sometimes,
    # that division isn't even, and we must pad the leftover sequence.
    # When it is added to the buffer, the last sequence will be padded. So if seq_len = 3 and
    # the trajectory is of length 10, the last sequence is [obs,pad,pad] once it is added to the buffer.
    # Compute the number of elements in this sequence that will end up being padded.
    leftover_seq_len = num_experiences % self.policy.sequence_length

    all_values: Dict[str, List[np.ndarray]] = defaultdict(list)
    _mem = initial_memory

    # Evaluate the full-length sequences first, carrying over _mem after each one
    for seq_num in range(num_experiences // self.policy.sequence_length):
        seq_obs = []
        for _ in range(self.policy.sequence_length):
            all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))
        start = seq_num * self.policy.sequence_length
        end = (seq_num + 1) * self.policy.sequence_length
        for _obs in tensor_obs:
            seq_obs.append(_obs[start:end])
        values, _mem = self.critic.critic_pass(
            seq_obs, _mem, sequence_length=self.policy.sequence_length
        )
        for signal_name, _val in values.items():
            all_values[signal_name].append(_val)

    # Compute values for the potentially truncated last sequence. Note that this
    # sequence isn't padded yet, but will be.
    seq_obs = []
    if leftover_seq_len > 0:
        for _obs in tensor_obs:
            last_seq_obs = _obs[-leftover_seq_len:]
            seq_obs.append(last_seq_obs)

        # For the last sequence, the initial memory should be the one carried
        # over from the previous sequence of this trajectory.
        for _ in range(leftover_seq_len):
            all_next_memories.append(ModelUtils.to_numpy(_mem.squeeze()))

        last_values, _mem = self.critic.critic_pass(
            seq_obs, _mem, sequence_length=leftover_seq_len
        )
        for signal_name, _val in last_values.items():
            all_values[signal_name].append(_val)

    # Create one tensor per reward signal
    all_value_tensors = {
        signal_name: torch.cat(value_list, dim=0)
        for signal_name, value_list in all_values.items()
    }
    next_mem = _mem
    return all_value_tensors, all_next_memories, next_mem
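# A worked example of the revised split above, under the "last sequence is padded"
# scheme. The helper below exists only for illustration, with the same hypothetical
# numbers (sequence_length = 3, trajectory of 10 experiences): the full sequences
# cover [0:3], [3:6] and [6:9], and the leftover tail [9:10] is evaluated with
# sequence_length = 1 and only padded later, when it is added to the buffer.
def _last_padded_slices(num_experiences, sequence_length):
    leftover_seq_len = num_experiences % sequence_length
    slices = [
        (seq_num * sequence_length, (seq_num + 1) * sequence_length)
        for seq_num in range(num_experiences // sequence_length)
    ]
    if leftover_seq_len > 0:
        slices.append((num_experiences - leftover_seq_len, num_experiences))
    return slices


assert _last_padded_slices(10, 3) == [(0, 3), (3, 6), (6, 9), (9, 10)]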