def test_compare_discrete_action_diff(observation_shape, action_size, n_episodes, episode_length):
    """compare_discrete_action_match should equal the mean action-agreement rate."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size))
        rews = np.random.random((episode_length, 1))
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    # two independent deterministic policies to compare
    mat_a = np.random.random(observation_shape + (action_size,))
    mat_b = np.random.random(observation_shape + (action_size,))
    algo = DummyAlgo(mat_a, 0.0, discrete=True)
    base_algo = DummyAlgo(mat_b, 0.0, discrete=True)

    # reference: fraction of transitions where both policies pick the same action
    total_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        ours = algo.predict(batch.observations)
        theirs = base_algo.predict(batch.observations)
        total_matches.extend((ours == theirs).tolist())

    score = compare_discrete_action_match(base_algo)(algo, episodes)
    assert np.allclose(score, np.mean(total_matches))
def test_compare_continuous_action_diff(observation_shape, action_size, n_episodes, episode_length):
    """compare_continuous_action_diff should equal the negated mean squared action gap."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size))
        rews = np.random.random((episode_length, 1))
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    # two independent deterministic policies to compare
    mat_a = np.random.random(observation_shape + (action_size,))
    mat_b = np.random.random(observation_shape + (action_size,))
    algo = DummyAlgo(mat_a, 0.0)
    base_algo = DummyAlgo(mat_b, 0.0)

    # reference: per-transition squared L2 distance between the two policies
    total_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        ours = algo.predict(batch.observations)
        theirs = base_algo.predict(batch.observations)
        total_diffs.extend(((ours - theirs) ** 2).sum(axis=1).tolist())

    score = compare_continuous_action_diff(base_algo)(algo, episodes)
    assert np.allclose(score, -np.mean(total_diffs))
def test_value_estimation_std_scorer(observation_shape, action_size, n_episodes, episode_length):
    """value_estimation_std_scorer should return the negated mean value std."""
    # projection matrix for deterministic action
    proj = np.random.random(observation_shape + (action_size,))
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, proj).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(proj, 0.0)

    # reference: stds of value estimates at the policy's own actions
    total_stds = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        _, stds = algo.predict_value(batch.observations, policy_actions, True)
        total_stds.extend(stds.tolist())

    score = value_estimation_std_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(total_stds))
def test_soft_opc_scorer(observation_shape, action_size, n_episodes, episode_length, threshold):
    """soft_opc_scorer should equal mean(success values) - mean(all values)."""
    # projection matrix for deterministic action
    proj = np.random.random(observation_shape + (action_size,))
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, proj).astype('f4')
        rews = np.random.random((episode_length, 1)).astype('f4')
        episodes.append(
            Episode(observation_shape, action_size, obs.astype('f4'), acts, rews)
        )

    algo = DummyAlgo(proj, 0.0)

    # reference: bucket value estimates by whether the episode return clears
    # the success threshold
    success_values = []
    all_values = []
    for episode in episodes:
        is_success = episode.compute_return() >= threshold
        batch = TransitionMiniBatch(episode.transitions)
        values = algo.predict_value(batch.observations, batch.actions)
        if is_success:
            success_values.extend(values.tolist())
        all_values.extend(values.tolist())

    scorer = soft_opc_scorer(threshold)
    score = scorer(algo, episodes)
    assert np.allclose(score, np.mean(success_values) - np.mean(all_values))
def test_discounted_sum_of_advantage_scorer(observation_shape, action_size, n_episodes, episode_length, gamma):
    """Scorer should match the naive discounted advantage-sum reference."""
    # projection matrix for deterministic action
    proj = np.random.random(observation_shape + (action_size,))
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        # make difference between algorithm outputs and dataset
        noise = 100 * np.random.random((episode_length, action_size))
        acts = (np.matmul(obs, proj) + noise).astype('f4')
        rews = np.random.random((episode_length, 1)).astype('f4')
        episodes.append(
            Episode(observation_shape, action_size, obs.astype('f4'), acts, rews)
        )

    algo = DummyAlgo(proj, gamma)

    # reference values computed by the naive helper
    ref_sums = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        ref_sums += ref_discounted_sum_of_advantage_score(
            algo.predict_value,
            batch.observations,
            batch.actions,
            policy_actions,
            gamma,
        )

    score = discounted_sum_of_advantage_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_sums))
def test_continuous_action_diff_scorer(observation_shape, action_size, n_episodes, episode_length):
    """Scorer should return the negated mean squared dataset/policy action gap."""
    # projection matrix for deterministic action
    proj = np.random.random(observation_shape + (action_size,))
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size)).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(proj, 0.0)

    # reference: squared L2 distance between logged and predicted actions
    total_diffs = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        total_diffs.extend(
            ((batch.actions - policy_actions) ** 2).sum(axis=1).tolist()
        )

    score = continuous_action_diff_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(total_diffs))
def test_dynamics_prediction_variance_scorer(observation_shape, action_size, n_episodes, episode_length):
    """Scorer should return the negated mean prediction variance."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size))
        rews = np.random.random((episode_length, 1))
        episodes.append(
            Episode(
                observation_shape,
                action_size,
                obs.astype("f4"),
                acts.astype("f4"),
                rews.astype("f4"),
            )
        )

    dynamics = DummyDynamics(np.random.random(observation_shape))

    # reference: variances returned when with_variance=True
    total_variances = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, _, var = dynamics.predict(batch.observations, batch.actions, True)
        total_variances.extend(var.tolist())

    score = dynamics_prediction_variance_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_variances))
def test_dynamics_observation_prediction_error_scorer(observation_shape, action_size, n_episodes, episode_length):
    """Scorer should return the negated mean squared next-observation error."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size)).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    dynamics = DummyDynamics(np.random.random(observation_shape))

    # reference: squared L2 error against the logged next observations
    total_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        pred_x, _ = dynamics.predict(batch.observations, batch.actions)
        errors = ((batch.next_observations - pred_x) ** 2).sum(axis=1)
        total_errors.extend(errors.tolist())

    score = dynamics_observation_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_errors))
def base_update_tester(model, observation_shape, action_size, discrete=False):
    """Run a single update step on a synthetic mini-batch and sanity-check it.

    Returns the generated transitions so callers can reuse them.
    """
    # build one mini-batch worth of random transitions
    transitions = []
    for _ in range(model.batch_size):
        obs = np.random.random(observation_shape)
        next_obs = np.random.random(observation_shape)
        reward = np.random.random()
        next_reward = np.random.random()
        terminal = np.random.randint(2)
        returns = np.random.random(100)
        consequent_observations = np.random.random((100, *observation_shape))
        if discrete:
            action = np.random.randint(action_size)
            next_action = np.random.randint(action_size)
        else:
            action = np.random.random(action_size)
            next_action = np.random.random(action_size)
        transitions.append(
            Transition(
                observation_shape,
                action_size,
                obs,
                action,
                reward,
                next_obs,
                next_action,
                next_reward,
                terminal,
                returns,
                consequent_observations,
            )
        )

    batch = TransitionMiniBatch(transitions)

    # check if update runs without errors
    model.create_impl(observation_shape, action_size)
    loss = model.update(0, 0, batch)
    assert len(loss) == len(model._get_loss_labels())

    return transitions
def test_discrete_action_math_scorer(observation_shape, action_size, n_episodes, episode_length):
    """discrete_action_match_scorer should equal the mean dataset/policy match rate.

    NOTE(review): "math" in this test's name looks like a typo for "match";
    renaming is left out here so the collected test ID stays stable.
    """
    # projection matrix for deterministic action
    proj = np.random.random(observation_shape + (action_size,))
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.randint(action_size, size=episode_length)
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    algo = DummyAlgo(proj, 0.0, discrete=True)

    # reference: fraction of transitions where the policy reproduces the
    # logged discrete action
    total_matches = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        policy_actions = algo.predict(batch.observations)
        total_matches.extend(
            (batch.actions.reshape(-1) == policy_actions).tolist()
        )

    score = discrete_action_match_scorer(algo, episodes)
    assert np.allclose(score, np.mean(total_matches))
def test_td_error_scorer(observation_shape, action_size, n_episodes, episode_length, gamma):
    """td_error_scorer should match the naive TD-error reference."""
    # projection matrix for deterministic action
    proj = np.random.random(observation_shape + (action_size,))
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.matmul(obs, proj).astype('f4')
        rews = np.random.random((episode_length, 1)).astype('f4')
        episodes.append(
            Episode(observation_shape, action_size, obs.astype('f4'), acts, rews)
        )

    algo = DummyAlgo(proj, gamma)

    # reference TD errors computed by the naive helper
    ref_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        ref_errors += ref_td_error_score(
            algo.predict_value,
            batch.observations,
            batch.actions,
            np.asarray(batch.next_rewards).reshape(-1),
            batch.next_observations,
            batch.next_actions,
            np.asarray(batch.terminals).reshape(-1),
            gamma,
        )

    score = td_error_scorer(algo, episodes)
    assert np.allclose(score, -np.mean(ref_errors))
def test_transition_minibatch(data_size, observation_size, action_size, gamma):
    """TransitionMiniBatch should mirror every transition field and act list-like."""
    observations = np.random.random((data_size, observation_size))
    actions = np.random.random((data_size, action_size))
    rewards = np.random.random((data_size, 1))
    episode = Episode(
        (observation_size,), action_size, observations, actions, rewards, gamma
    )

    batch = TransitionMiniBatch(episode.transitions)

    # (batch attribute, transition attribute) pairs that must agree row-wise
    field_pairs = [
        ("observations", "observation"),
        ("actions", "action"),
        ("rewards", "reward"),
        ("next_observations", "next_observation"),
        ("next_actions", "next_action"),
        ("next_rewards", "next_reward"),
        ("terminals", "terminal"),
        ("returns", "returns"),
        ("consequent_observations", "consequent_observations"),
    ]
    for i, t in enumerate(episode.transitions):
        for batch_attr, transition_attr in field_pairs:
            assert np.all(
                getattr(batch, batch_attr)[i] == getattr(t, transition_attr)
            )

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_compute_lambda_return(
    data_size, observation_shape, action_size, n_frames, gamma, lam
):
    """compute_lambda_return should match a naively computed TD(lambda) target."""
    # image observations are uint8, vector observations float32
    if len(observation_shape) == 3:
        observations = np.random.randint(
            256, size=(data_size, *observation_shape), dtype=np.uint8
        )
    else:
        observations = np.random.random(
            (data_size,) + observation_shape
        ).astype("f4")
    actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")
    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
    )

    # minimal stand-in value function: mean over flattened observation
    class DummyAlgo:
        def predict_value(self, observations):
            batch_size = observations.shape[0]
            return np.mean(observations.reshape((batch_size, -1)), axis=1)

    algo = DummyAlgo()

    transitions = episode.transitions
    # an arbitrary mid-episode start point for the lambda return
    transition = transitions[3]

    # compute reference naively
    t = transition
    observations = []
    returns = []
    R = 0.0
    for i in range(data_size):
        # frame-stacked next observation exactly as the mini-batch would build it
        observation = TransitionMiniBatch([t], n_frames).next_observations[0]
        observations.append(observation)
        # accumulate the discounted reward-to-go up to step i
        R += (gamma ** i) * t.next_reward
        returns.append(R)
        t = t.next_transition
        if t is None:
            break
    values = algo.predict_value(np.array(observations))
    # terminal bootstrap value is zero
    values[-1] = 0.0
    # add the discounted bootstrap value to each n-step return
    gammas = gamma ** (np.arange(len(observations)) + 1)
    returns += gammas * values
    # lambda-weighted mixture of the n-step returns; the final (full) return
    # carries the residual lambda^(k-1) weight
    lambdas = lam ** np.arange(len(observations))
    ref_lambda_return = (1.0 - lam) * np.sum(lambdas[:-1] * returns[:-1])
    ref_lambda_return += lambdas[-1] * returns[-1]

    # compute lambda return
    lambda_return = compute_lambda_return(
        transition, algo, gamma, lam, n_frames
    )
    assert np.allclose(ref_lambda_return, lambda_return)
def base_update_tester(model, observation_shape, action_size, discrete=False):
    """Run one update on a synthetic, forward-linked mini-batch and sanity-check it.

    Returns the generated transitions so callers can reuse them.
    """
    is_image = len(observation_shape) == 3
    transitions = []
    prev_transition = None
    for _ in range(model.batch_size):
        # image observations are uint8, vector observations float32
        if is_image:
            observation = np.random.randint(
                256, size=observation_shape, dtype=np.uint8
            )
            next_observation = np.random.randint(
                256, size=observation_shape, dtype=np.uint8
            )
        else:
            observation = np.random.random(observation_shape).astype("f4")
            next_observation = np.random.random(observation_shape).astype("f4")
        reward = np.random.random()
        next_reward = np.random.random()
        terminal = np.random.randint(2)
        if discrete:
            action = np.random.randint(action_size)
            next_action = np.random.randint(action_size)
        else:
            action = np.random.random(action_size).astype("f4")
            next_action = np.random.random(action_size).astype("f4")
        transition = Transition(
            observation_shape=observation_shape,
            action_size=action_size,
            observation=observation,
            action=action,
            reward=reward,
            next_observation=next_observation,
            next_action=next_action,
            next_reward=next_reward,
            terminal=terminal,
            prev_transition=prev_transition,
        )
        # link the previous transition forward to this one
        if prev_transition:
            prev_transition.next_transition = transition
        prev_transition = transition
        transitions.append(transition)

    batch = TransitionMiniBatch(transitions)

    # check if update runs without errors
    model.create_impl(observation_shape, action_size)
    loss = model.update(0, 0, batch)
    assert len(loss.items()) > 0

    return transitions
def test_dynamics_reward_prediction_error_scorer(
    observation_shape,
    action_size,
    n_episodes,
    episode_length,
    reward_scaler,
):
    """Scorer should return the negated mean squared reward prediction error."""
    episodes = []
    for _ in range(n_episodes):
        obs = np.random.random((episode_length,) + observation_shape)
        acts = np.random.random((episode_length, action_size)).astype("f4")
        rews = np.random.random((episode_length, 1)).astype("f4")
        episodes.append(
            Episode(observation_shape, action_size, obs.astype("f4"), acts, rews)
        )

    dynamics = DummyDynamics(np.random.random(observation_shape), reward_scaler)

    # reference: squared error against the (optionally scaled) next rewards
    total_errors = []
    for episode in episodes:
        batch = TransitionMiniBatch(episode.transitions)
        _, pred_reward = dynamics.predict(batch.observations, batch.actions)
        if reward_scaler:
            next_rewards = reward_scaler.transform_numpy(batch.next_rewards)
        else:
            next_rewards = batch.next_rewards
        errors = ((next_rewards - pred_reward) ** 2).reshape(-1)
        total_errors.extend(errors.tolist())

    score = dynamics_reward_prediction_error_scorer(dynamics, episodes)
    assert np.allclose(score, -np.mean(total_errors))
def test_torch_api_with_batch(
    batch_size,
    observation_shape,
    action_size,
    use_scaler,
    use_action_scaler,
    use_reward_scaler,
):
    """torch_api_func_with_batch should apply each configured scaler exactly once."""
    obs_shape = (batch_size,) + observation_shape
    transitions = []
    for _ in range(batch_size):
        # NOTE(review): each per-transition observation is built with obs_shape,
        # which includes the batch dimension — presumably fine for this dummy
        # impl, but verify against Transition's expected observation shape.
        transition = Transition(
            observation_shape=observation_shape,
            action_size=action_size,
            observation=np.random.random(obs_shape),
            action=np.random.random(action_size),
            reward=np.random.random(),
            next_observation=np.random.random(obs_shape),
            next_action=np.random.random(action_size),
            next_reward=np.random.random(),
            terminal=0.0,
        )
        transitions.append(transition)

    # each dummy scaler adds a distinct constant so its application is detectable
    if use_scaler:

        class DummyScaler:
            def transform(self, x):
                return x + 0.1

        scaler = DummyScaler()
    else:
        scaler = None
    if use_action_scaler:

        class DummyActionScaler:
            def transform(self, x):
                return x + 0.2

        action_scaler = DummyActionScaler()
    else:
        action_scaler = None
    if use_reward_scaler:

        class DummyRewardScaler:
            def transform(self, x):
                return x + 0.2

        reward_scaler = DummyRewardScaler()
    else:
        reward_scaler = None

    batch = TransitionMiniBatch(transitions)
    impl = DummyImpl()
    impl._scaler = scaler
    impl._action_scaler = action_scaler
    impl._reward_scaler = reward_scaler

    torch_batch = impl.torch_api_func_with_batch(batch)

    # observations are shifted by 0.1 only when the scaler is configured
    if use_scaler:
        assert np.all(
            torch_batch.observations.numpy() == batch.observations + 0.1
        )
        assert np.all(
            torch_batch.next_observations.numpy()
            == batch.next_observations + 0.1
        )
    else:
        assert np.all(torch_batch.observations.numpy() == batch.observations)
        assert np.all(
            torch_batch.next_observations.numpy() == batch.next_observations
        )
    # actions are shifted by 0.2 only when the action scaler is configured
    if use_action_scaler:
        assert np.all(torch_batch.actions.numpy() == batch.actions + 0.2)
        assert np.all(
            torch_batch.next_actions.numpy() == batch.next_actions + 0.2
        )
    else:
        assert np.all(torch_batch.actions.numpy() == batch.actions)
        assert np.all(torch_batch.next_actions.numpy() == batch.next_actions)
    # rewards are shifted by 0.2 only when the reward scaler is configured
    if use_reward_scaler:
        assert np.all(torch_batch.rewards.numpy() == batch.rewards + 0.2)
        assert np.all(
            torch_batch.next_rewards.numpy() == batch.next_rewards + 0.2
        )
    else:
        assert np.all(torch_batch.rewards.numpy() == batch.rewards)
        assert np.all(torch_batch.next_rewards.numpy() == batch.next_rewards)
    # terminals and n_steps are never scaled
    assert np.all(torch_batch.terminals.numpy() == batch.terminals)
    assert np.all(torch_batch.n_steps.numpy() == batch.n_steps)
def test_transition_minibatch(data_size, observation_shape, action_size, n_frames, discrete_action):
    """TransitionMiniBatch should stack frames for images and mirror all fields."""
    # image observations are uint8, vector observations float32
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random((data_size, ) +
                                        observation_shape).astype('f4')
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype('f4')
    rewards = np.random.random((data_size, 1)).astype('f4')
    episode = Episode(observation_shape=observation_shape,
                      action_size=action_size,
                      observations=observations,
                      actions=actions,
                      rewards=rewards)

    # expected batched shape: frame stacking multiplies the channel dimension
    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    # create padded observations for check stacking
    padding = np.zeros((n_frames - 1, *observation_shape), dtype=np.uint8)
    padded_observations = np.vstack([padding, observations])

    batch = TransitionMiniBatch(episode.transitions, n_frames)
    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape

    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]
        if n_frames > 1 and len(observation_shape) == 3:
            # check frame stacking: each batched row must equal the n_frames
            # window ending at i (zero-padded before the episode start), and
            # the next observation is the same window shifted by one step
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + 1:tail_index + 1]
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            # no stacking: rows equal the raw transition observations
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, t.next_observation)
        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == t.next_action)
        assert np.all(batch.next_rewards[i][0] == t.next_reward)
        assert np.all(batch.terminals[i][0] == t.terminal)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_transition_minibatch(
    data_size,
    observation_shape,
    action_size,
    n_frames,
    n_steps,
    gamma,
    discrete_action,
    create_mask,
    mask_size,
):
    """TransitionMiniBatch should build n-step targets, stack frames, and carry masks."""
    # image observations are uint8, vector observations float32
    if len(observation_shape) == 3:
        observations = np.random.randint(256,
                                         size=(data_size, *observation_shape),
                                         dtype=np.uint8)
    else:
        observations = np.random.random((data_size, ) +
                                        observation_shape).astype("f4")
    if discrete_action:
        actions = np.random.randint(action_size, size=data_size)
    else:
        actions = np.random.random((data_size, action_size)).astype("f4")
    rewards = np.random.random((data_size, 1)).astype("f4")
    episode = Episode(
        observation_shape=observation_shape,
        action_size=action_size,
        observations=observations,
        actions=actions,
        rewards=rewards,
        create_mask=create_mask,
        mask_size=mask_size,
    )

    # expected batched shape: frame stacking multiplies the channel dimension
    if len(observation_shape) == 3:
        n_channels = n_frames * observation_shape[0]
        image_size = observation_shape[1:]
        batched_observation_shape = (data_size - 1, n_channels, *image_size)
    else:
        batched_observation_shape = (data_size - 1, *observation_shape)

    batch = TransitionMiniBatch(episode.transitions, n_frames, n_steps, gamma)
    assert batch.observations.shape == batched_observation_shape
    assert batch.next_observations.shape == batched_observation_shape

    for i, t in enumerate(episode.transitions):
        observation = batch.observations[i]
        next_observation = batch.next_observations[i]

        # effective horizon is capped by the remaining episode length
        n = int(batch.n_steps[i][0])
        assert n == min(data_size - i - 1, n_steps)

        if n_frames > 1 and len(observation_shape) == 3:
            # create padded observations for check stacking
            pad = ((n_frames - 1, 0), (0, 0), (0, 0), (0, 0))
            padded_observations = np.pad(observations, pad, "edge")
            # check frame stacking: next window is shifted by the n-step horizon
            head_index = i
            tail_index = head_index + n_frames
            window = padded_observations[head_index:tail_index]
            next_window = padded_observations[head_index + n:tail_index + n]
            ref_observation = np.vstack(window)
            ref_next_observation = np.vstack(next_window)
            assert observation.shape == ref_observation.shape
            assert next_observation.shape == ref_next_observation.shape
            assert np.all(observation == ref_observation)
            assert np.all(next_observation == ref_next_observation)
        else:
            # no stacking: next observation comes from the transition n-1
            # hops down the next_transition chain
            next_t = t
            for _ in range(n - 1):
                next_t = next_t.next_transition
            assert np.allclose(observation, t.observation)
            assert np.allclose(next_observation, next_t.next_observation)

        # recompute the n-step discounted reward, final action, and terminal
        # flag by walking the transition chain
        next_reward = 0.0
        next_action = 0.0
        terminal = 0.0
        next_t = t
        for j in range(n):
            next_reward += next_t.next_reward * gamma**j
            next_action = next_t.next_action
            terminal = next_t.terminal
            next_t = next_t.next_transition
        assert np.all(batch.actions[i] == t.action)
        assert np.all(batch.rewards[i][0] == t.reward)
        assert np.all(batch.next_actions[i] == next_action)
        assert np.allclose(batch.next_rewards[i][0], next_reward)
        assert np.all(batch.terminals[i][0] == terminal)

    # check mask
    if create_mask:
        assert batch.masks.shape == (mask_size, data_size - 1, 1)
    else:
        assert batch.masks is None

    # check additional data
    value = np.random.random(100)
    batch.add_additional_data("test", value)
    assert np.all(batch.get_additional_data("test") == value)

    # check list-like behavior
    assert len(batch) == data_size - 1
    assert batch[0] is episode.transitions[0]
    for i, transition in enumerate(batch):
        assert isinstance(transition, Transition)
        assert transition is episode.transitions[i]
def test_torch_mini_batch(
    batch_size, observation_shape, action_size, use_scaler, use_action_scaler
):
    """TorchMiniBatch should apply each configured scaler exactly once."""
    obs_shape = (batch_size,) + observation_shape
    transitions = []
    for _ in range(batch_size):
        # NOTE(review): each per-transition observation is built with obs_shape,
        # which includes the batch dimension — presumably fine for this dummy
        # setup, but verify against Transition's expected observation shape.
        transition = Transition(
            observation_shape=observation_shape,
            action_size=action_size,
            observation=np.random.random(obs_shape),
            action=np.random.random(action_size),
            reward=np.random.random(),
            next_observation=np.random.random(obs_shape),
            next_action=np.random.random(action_size),
            next_reward=np.random.random(),
            terminal=0.0,
        )
        transitions.append(transition)

    # each dummy scaler adds a distinct constant so its application is detectable
    if use_scaler:

        class DummyScaler:
            def transform(self, x):
                return x + 0.1

        scaler = DummyScaler()
    else:
        scaler = None
    if use_action_scaler:

        class DummyActionScaler:
            def transform(self, x):
                return x + 0.2

        action_scaler = DummyActionScaler()
    else:
        action_scaler = None

    batch = TransitionMiniBatch(transitions)
    torch_batch = TorchMiniBatch(
        batch=batch, device="cpu:0", scaler=scaler, action_scaler=action_scaler
    )

    # observations are shifted by 0.1 only when the scaler is configured
    if use_scaler:
        assert np.all(
            torch_batch.observations.numpy() == batch.observations + 0.1
        )
        assert np.all(
            torch_batch.next_observations.numpy()
            == batch.next_observations + 0.1
        )
    else:
        assert np.all(torch_batch.observations.numpy() == batch.observations)
        assert np.all(
            torch_batch.next_observations.numpy() == batch.next_observations
        )
    # actions are shifted by 0.2 only when the action scaler is configured
    if use_action_scaler:
        assert np.all(torch_batch.actions.numpy() == batch.actions + 0.2)
        assert np.all(
            torch_batch.next_actions.numpy() == batch.next_actions + 0.2
        )
    else:
        assert np.all(torch_batch.actions.numpy() == batch.actions)
        assert np.all(torch_batch.next_actions.numpy() == batch.next_actions)
    # rewards, terminals, and n_steps pass through unscaled
    assert np.all(torch_batch.rewards.numpy() == batch.rewards)
    assert np.all(torch_batch.next_rewards.numpy() == batch.next_rewards)
    assert np.all(torch_batch.terminals.numpy() == batch.terminals)
    assert np.all(torch_batch.n_steps.numpy() == batch.n_steps)