def test_buffer():
    """Exercise AgentBuffer batching, reset, and resequencing into an update buffer."""
    buf_1 = construct_fake_buffer(1)
    buf_2 = construct_fake_buffer(2)
    buf_3 = construct_fake_buffer(3)
    obs_key = ObsUtil.get_name_at(0)

    # batch_size=2, training_length=1, sequential: the last two steps of agent 1.
    batch = buf_1[obs_key].get_batch(batch_size=2, training_length=1, sequential=True)
    assert_array(np.array(batch), np.array([[171, 172, 173], [181, 182, 183]]))

    # Two non-overlapping sequences of length 3 from the tail of agent 2
    # (steps 3..8; values are 200 + 10*step + component).
    batch = buf_2[obs_key].get_batch(batch_size=2, training_length=3, sequential=True)
    expected = np.array([[200 + 10 * s + d for d in (1, 2, 3)] for s in range(3, 9)])
    assert_array(np.array(batch), expected)

    # Non-sequential: overlapping windows ending at the last two steps.
    batch = buf_2[obs_key].get_batch(batch_size=2, training_length=3, sequential=False)
    expected = np.array(
        [[200 + 10 * s + d for d in (1, 2, 3)] for s in (5, 6, 7, 6, 7, 8)]
    )
    assert_array(np.array(batch), expected)

    # Resetting an agent's buffer empties it.
    buf_1.reset_agent()
    assert buf_1.num_experiences == 0

    # Resequence two 9-step agents into one update buffer with training_length=2;
    # the length-20 assert implies each agent pads out to 10 entries.
    update_buffer = AgentBuffer()
    buf_2.resequence_and_append(update_buffer, batch_size=None, training_length=2)
    buf_3.resequence_and_append(update_buffer, batch_size=None, training_length=2)
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
    assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)

    # A mini-batch slice keeps all keys and narrows every field to the slice.
    mini = update_buffer.make_mini_batch(start=0, end=1)
    assert mini.keys() == update_buffer.keys()
    assert np.array(mini[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)
def create_agent_buffer(behavior_spec: BehaviorSpec, number: int, reward: float = 0.0) -> AgentBuffer:
    """Build an AgentBuffer containing `number` identical experiences for `behavior_spec`.

    One random observation per sensor is drawn once and reused for every step;
    likewise a single random action is sampled and repeated.
    """
    buffer = AgentBuffer()
    current_observations = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    next_observations = [
        np.random.normal(size=sen_spec.shape).astype(np.float32)
        for sen_spec in behavior_spec.sensor_specs
    ]
    sampled = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action["continuous_action"] = sampled.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action["discrete_action"] = sampled.discrete
    for _ in range(number):
        for idx, obs in enumerate(current_observations):
            buffer[ObsUtil.get_name_at(idx)].append(obs)
        for idx, obs in enumerate(next_observations):
            buffer[ObsUtil.get_name_at_next(idx)].append(obs)
        buffer["actions"].append(action)
        # Also store each action component under its own key, first row only.
        for key, values in action.items():
            buffer[key].append(values[0, :])
        buffer["reward"].append(np.ones(1, dtype=np.float32) * reward)
        buffer["masks"].append(np.ones(1, dtype=np.float32))
    # "done" is assigned wholesale rather than appended step by step.
    buffer["done"] = np.zeros(number, dtype=np.float32)
    return buffer
def construct_fake_buffer(fake_agent_id):
    """Create an AgentBuffer with 9 deterministic steps for `fake_agent_id`.

    Values encode agent id and step as 100 * id + 10 * step + component, so
    tests can assert exact batch contents.
    """
    buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        obs = np.array([base + 1, base + 2, base + 3], dtype=np.float32)
        act = np.array([base + 4, base + 5], dtype=np.float32)
        # Group entry is a fresh array (not the same object as `act`),
        # replicated for a group of 3 agents.
        group_act = np.array([base + 4, base + 5], dtype=np.float32)
        buffer[ObsUtil.get_name_at(0)].append(obs)
        buffer[BufferKey.CONTINUOUS_ACTION].append(act)
        buffer[BufferKey.GROUP_CONTINUOUS_ACTION].append([group_act] * 3)
    return buffer
def create_agent_buffer(behavior_spec: BehaviorSpec, number: int, reward: float = 0.0) -> AgentBuffer:
    """Build an AgentBuffer with `number` identical experiences for `behavior_spec`.

    One random observation per observation spec is drawn once and reused every
    step; a single random action is sampled and repeated likewise.
    """
    buffer = AgentBuffer()
    current_observations = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    next_observations = [
        np.random.normal(size=obs_spec.shape).astype(np.float32)
        for obs_spec in behavior_spec.observation_specs
    ]
    sampled = behavior_spec.action_spec.random_action(1)
    action = {}
    if behavior_spec.action_spec.continuous_size > 0:
        action[BufferKey.CONTINUOUS_ACTION] = sampled.continuous
    if behavior_spec.action_spec.discrete_size > 0:
        action[BufferKey.DISCRETE_ACTION] = sampled.discrete
    for _ in range(number):
        for idx, obs in enumerate(current_observations):
            buffer[ObsUtil.get_name_at(idx)].append(obs)
        for idx, obs in enumerate(next_observations):
            buffer[ObsUtil.get_name_at_next(idx)].append(obs)
        # TODO: the original combined-actions append is disabled pending a key:
        # buffer[AgentBufferKey.ACTIONS].append(action)
        for key, values in action.items():
            buffer[key].append(values[0, :])
        # TODO: this key was the string "rewards" before the BufferKey migration.
        buffer[BufferKey.ENVIRONMENT_REWARDS].append(
            np.ones(1, dtype=np.float32) * reward
        )
        buffer[BufferKey.MASKS].append(np.ones(1, dtype=np.float32))
    # DONE is assigned wholesale rather than appended step by step.
    buffer[BufferKey.DONE] = np.zeros(number, dtype=np.float32)
    return buffer
def construct_fake_buffer(fake_agent_id):
    """Create an AgentBuffer with 9 deterministic steps for `fake_agent_id`.

    Values encode agent id and step as 100 * id + 10 * step + component; entries
    are plain Python lists, matching what the assertions in this file expect.
    """
    buffer = AgentBuffer()
    for step in range(9):
        base = 100 * fake_agent_id + 10 * step
        buffer[ObsUtil.get_name_at(0)].append([base + 1, base + 2, base + 3])
        buffer[BufferKey.CONTINUOUS_ACTION].append([base + 4, base + 5])
    return buffer
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    """Convert recorded (AgentInfo, ActionInfo) proto pairs into a processed
    AgentBuffer, resequenced into sequences of `sequence_length`.

    Each experience pairs the CURRENT step's observations/actions with the
    NEXT step's reward/done, so the last pair_info is only used as "next".
    """
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        # Stop before the final entry: it has no successor to supply reward/done.
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        # First step has no previous action: use a zero vector of the right size.
        previous_action = (
            np.array(
                pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32
            )
            * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )
        # A terminal step on the next frame means this experience ends an episode.
        next_done = len(next_terminal_step) == 1
        next_reward = 0
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        # Pull observations from whichever step type the current frame produced.
        current_obs = None
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs
        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        for i, obs in enumerate(current_obs):
            demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
        if (
            len(current_pair_info.action_info.continuous_actions) == 0
            and len(current_pair_info.action_info.discrete_actions) == 0
        ):
            # Legacy demo format: only the deprecated vector action is present;
            # route it to continuous or discrete based on the action spec.
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
            else:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
        else:
            # New format: continuous and discrete actions stored separately
            # (an agent may have both).
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.continuous_actions
                )
            if behavior_spec.action_spec.discrete_size > 0:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.discrete_actions
                )
        demo_raw_buffer["prev_action"].append(previous_action)
        # At episode boundaries, flush the raw buffer into fixed-length
        # sequences so sequences never span two episodes.
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    # Flush any trailing partial episode after the loop.
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
def test_buffer():
    """End-to-end checks of AgentBuffer: get_batch (sequential, overlapping,
    padded), group-entry padding, agent reset, resequencing into an update
    buffer, and mini-batch slicing."""
    agent_1_buffer = construct_fake_buffer(1)
    agent_2_buffer = construct_fake_buffer(2)
    agent_3_buffer = construct_fake_buffer(3)
    # Test get_batch
    a = agent_1_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=1, sequential=True
    )
    # Last two observations of agent 1 (values encode 100*id + 10*step + comp).
    assert_array(
        np.array(a), np.array([[171, 172, 173], [181, 182, 183]], dtype=np.float32)
    )
    # Test get_batch
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=3, sequential=True
    )
    # Two non-overlapping length-3 sequences from agent 2's tail (steps 3..8).
    assert_array(
        np.array(a),
        np.array(
            [
                [231, 232, 233],
                [241, 242, 243],
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
            ],
            dtype=np.float32,
        ),
    )
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=2, training_length=3, sequential=False
    )
    # sequential=False yields overlapping windows ending at the last two steps.
    assert_array(
        np.array(a),
        np.array(
            [
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
            ]
        ),
    )
    # Test padding
    a = agent_2_buffer[ObsUtil.get_name_at(0)].get_batch(
        batch_size=None, training_length=4, sequential=True
    )
    # All 9 entries, zero-padded up to a multiple of training_length (12 rows).
    assert_array(
        np.array(a),
        np.array(
            [
                [201, 202, 203],
                [211, 212, 213],
                [221, 222, 223],
                [231, 232, 233],
                [241, 242, 243],
                [251, 252, 253],
                [261, 262, 263],
                [271, 272, 273],
                [281, 282, 283],
                [0, 0, 0],
                [0, 0, 0],
                [0, 0, 0],
            ]
        ),
    )
    # Test group entries return Lists of Lists. Make sure to pad properly!
    a = agent_2_buffer[BufferKey.GROUP_CONTINUOUS_ACTION].get_batch(
        batch_size=None, training_length=4, sequential=True
    )
    # Real entries hold 3 group members; the 3 padding entries are empty.
    for _group_entry in a[:-3]:
        assert len(_group_entry) == 3
    for _group_entry in a[-3:]:
        assert len(_group_entry) == 0
    agent_1_buffer.reset_agent()
    assert agent_1_buffer.num_experiences == 0
    update_buffer = AgentBuffer()
    agent_2_buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    agent_3_buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=2
    )
    # Two 9-step agents with training_length=2 — the 20-length assert implies
    # each pads to 10 entries.
    assert len(update_buffer[BufferKey.CONTINUOUS_ACTION]) == 20
    assert np.array(update_buffer[BufferKey.CONTINUOUS_ACTION]).shape == (20, 2)
    c = update_buffer.make_mini_batch(start=0, end=1)
    assert c.keys() == update_buffer.keys()
    # Make sure the values of c are AgentBufferField
    for val in c.values():
        assert isinstance(val, AgentBufferField)
    assert np.array(c[BufferKey.CONTINUOUS_ACTION]).shape == (1, 2)