Example #1
def test_normalizer_after_load(tmp_path):
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)

    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)
    # Save ckpt and load into another policy
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0
    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Make another update to new policy, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
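
The expected statistics in the assertions above follow from the data alone: the test zeroes half of the first six observations (the rest are implied to be 1.0 by the mean of 0.5), and the second trajectory adds ten more observations of 1.0. A quick numpy check of those numbers (a sketch, not part of the test):

import numpy as np

# First update: 3 observations forced to 0.0, the other 3 are 1.0
first_batch = np.array([0.0] * 3 + [1.0] * 3, dtype=np.float32)
assert np.mean(first_batch) == 0.5
assert np.var(first_batch) == 0.25

# Second update adds 10 observations of 1.0, for 16 in total (13 ones, 3 zeros)
all_obs = np.concatenate([first_batch, np.ones(10, dtype=np.float32)])
assert np.mean(all_obs) == 0.8125              # 13 / 16
assert abs(np.var(all_obs) - 0.152) < 0.01     # 0.8125 * 0.1875 ≈ 0.1523
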
Example #2
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        seed, mock_spec, trainer_settings, model_path=model_path, load=load
    )
    return policy
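
A hypothetical usage sketch (not taken from the source): a helper like create_policy_mock is typically driven from a parametrized pytest test. It assumes the same module-level imports (pytest, TrainerSettings, TFPolicy) as the example above.

@pytest.mark.parametrize("discrete", [True, False])
@pytest.mark.parametrize("visual", [True, False])
@pytest.mark.parametrize("rnn", [True, False])
def test_create_policy_mock(rnn, visual, discrete):
    # Build one policy per action-space / observation / memory combination
    policy = create_policy_mock(
        TrainerSettings(), use_rnn=rnn, use_discrete=discrete, use_visual=visual
    )
    assert isinstance(policy, TFPolicy)
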
Example #3
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (NetworkSettings.MemorySettings()
                                              if use_rnn else None)
    policy = TFPolicy(
        0,
        mock_behavior_specs,
        trainer_config,
        "test",
        False,
        tanhresample,
        tanhresample,
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    # Normally the optimizer calls this after the BCModule is created
    policy.initialize_or_load()
    return bc_module
def create_optimizer_mock(trainer_config, reward_signal_config, use_rnn,
                          use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TFPolicy(0,
                      mock_specs,
                      trainer_settings,
                      "test",
                      False,
                      create_tf_graph=False)
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    optimizer.policy.initialize()
    return optimizer
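
A hypothetical usage sketch for create_bc_module above (not in the source); BehavioralCloningSettings and the demo path are assumptions about the surrounding test module rather than names taken from this excerpt.

def test_bc_module_update():
    specs = mb.setup_test_behavior_specs(
        True,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    # The demo path is illustrative; a real test points at a recorded .demo file
    bc_settings = BehavioralCloningSettings(demo_path="./test.demo")
    bc_module = create_bc_module(specs, bc_settings, use_rnn=False, tanhresample=False)
    stats = bc_module.update()  # one behavioral-cloning update step
    assert isinstance(stats, dict)
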
Example #5
    def create_policy(
        self, parsed_behavior_id: BehaviorIdentifiers, behavior_spec: BehaviorSpec
    ) -> TFPolicy:
        policy = TFPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            self.artifact_path,
            self.load,
            tanh_squash=True,
            reparameterize=True,
            create_tf_graph=False,
        )
        # Load the replay buffer if resuming from a checkpoint
        if self.load and self.checkpoint_replay_buffer:
            try:
                self.load_replay_buffer()
            except (AttributeError, FileNotFoundError):
                logger.warning(
                    "Replay buffer was unable to load, starting from scratch."
                )
            logger.debug(
                "Loaded update buffer with {} sequences".format(
                    self.update_buffer.num_experiences
                )
            )

        return policy
Example #6
def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(use_discrete=True,
                                                 use_visual=False,
                                                 vector_action_space=[2],
                                                 vector_obs_space=1)

    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 6
    assert mean[0] == 0.5
    # Note: the running variance must be divided by the number of steps, and it is
    # initialized to 1 to avoid divide-by-zero. The expected variance of the data is 0.25.
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1, )],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run([
        policy.normalization_steps, policy.running_mean,
        policy.running_variance
    ])

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
Example #7
    def create_policy(self, parsed_behavior_id: BehaviorIdentifiers,
                      behavior_spec: BehaviorSpec) -> TFPolicy:
        """
        Creates a PPO policy and adds it to the trainer's list of policies.
        :param behavior_spec: specifications for policy construction
        :return policy
        """
        policy = TFPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            condition_sigma_on_obs=False,  # Faster training for PPO
            create_tf_graph=False,  # We will create the TF graph in the Optimizer
        )

        return policy
def test_step_overflow():
    behavior_spec = mb.setup_test_behavior_specs(use_discrete=True,
                                                 use_visual=False,
                                                 vector_action_space=[2],
                                                 vector_obs_space=1)

    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        create_tf_graph=False,
    )
    policy.create_input_placeholders()
    policy.initialize()

    policy.set_step(2**31 - 1)
    assert policy.get_current_step() == 2**31 - 1
    policy.increment_step(3)
    assert policy.get_current_step() == 2**31 + 2
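
For context (not part of the test): 2**31 - 1 is the largest signed 32-bit integer, so a step counter stored as int32 would wrap negative instead of reaching 2**31 + 2. A small numpy illustration of the wrap-around the test guards against:

import numpy as np

steps = np.array([2 ** 31 - 1], dtype=np.int64) + 3   # 2147483650, what the test expects
assert steps[0] == 2 ** 31 + 2
wrapped = steps.astype(np.int32)                      # a 32-bit counter overflows here
assert wrapped[0] < 0                                 # wraps around to -2147483646
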
Example #9
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (NetworkSettings.MemorySettings(
        sequence_length=16, memory_size=10) if use_rnn else None)
    policy = TFPolicy(0,
                      mock_brain,
                      trainer_settings,
                      "test",
                      False,
                      create_tf_graph=False)
    optimizer = SACOptimizer(policy, trainer_settings)
    return optimizer
Example #10
    def create_tf_policy(
        self,
        parsed_behavior_id: BehaviorIdentifiers,
        behavior_spec: BehaviorSpec,
        create_graph: bool = False,
    ) -> TFPolicy:
        """
        Creates a policy with a Tensorflow backend and PPO hyperparameters
        :param parsed_behavior_id:
        :param behavior_spec: specifications for policy construction
        :param create_graph: whether to create the Tensorflow graph on construction
        :return policy
        """
        policy = TFPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            condition_sigma_on_obs=False,  # Faster training for PPO
            create_tf_graph=create_graph,
        )
        return policy
Example #11
def _create_ppo_optimizer_ops_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TFPolicy(
        0, mock_specs, trainer_settings, "test", False, create_tf_graph=False
    )
    optimizer = PPOOptimizer(policy, trainer_settings)
    policy.initialize()
    return optimizer
Example #12
    def create_tf_policy(
        self,
        parsed_behavior_id: BehaviorIdentifiers,
        behavior_spec: BehaviorSpec,
        create_graph: bool = False,
    ) -> TFPolicy:
        """
        Creates a policy with a Tensorflow backend and SAC hyperparameters
        :param parsed_behavior_id:
        :param behavior_spec: specifications for policy construction
        :param create_graph: whether to create the Tensorflow graph on construction
        :return policy
        """
        policy = TFPolicy(
            self.seed,
            behavior_spec,
            self.trainer_settings,
            tanh_squash=True,
            reparameterize=True,
            create_tf_graph=create_graph,
        )
        self.maybe_load_replay_buffer()
        return policy
Example #13
def test_large_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    # Taken from Walker seed 3713 which causes NaN without proper initialization
    large_obs1 = [
        1800.00036621,
        1799.96972656,
        1800.01245117,
        1800.07214355,
        1800.02758789,
        1799.98303223,
        1799.88647461,
        1799.89575195,
        1800.03479004,
        1800.14025879,
        1800.17675781,
        1800.20581055,
        1800.33740234,
        1800.36450195,
        1800.43457031,
        1800.45544434,
        1800.44604492,
        1800.56713867,
        1800.73901367,
    ]
    large_obs2 = [
        1799.99975586,
        1799.96679688,
        1799.92980957,
        1799.89550781,
        1799.93774414,
        1799.95300293,
        1799.94067383,
        1799.92993164,
        1799.84057617,
        1799.69873047,
        1799.70605469,
        1799.82849121,
        1799.85095215,
        1799.76977539,
        1799.78283691,
        1799.76708984,
        1799.67163086,
        1799.59191895,
        1799.5135498,
        1799.45556641,
        1799.3717041,
    ]
    policy = TFPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        "testdir",
        False,
    )
    time_horizon = len(large_obs1)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs1[i]], dtype=np.float32)
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )
    assert mean[0] == pytest.approx(np.mean(large_obs1, dtype=np.float32), abs=0.01)
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1, dtype=np.float32), abs=0.01
    )

    time_horizon = len(large_obs2)
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    for i in range(time_horizon):
        trajectory.steps[i].obs[0] = np.array([large_obs2[i]], dtype=np.float32)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert mean[0] == pytest.approx(
        np.mean(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
    assert variance[0] / steps == pytest.approx(
        np.var(large_obs1 + large_obs2, dtype=np.float32), abs=0.01
    )
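
A short numerical sketch (not from the source, reusing large_obs1 from the test above) of why observations near 1800 are delicate in float32, which is what the "NaN without proper initialization" comment refers to: the naive E[x^2] - E[x]^2 variance formula cancels catastrophically at that magnitude, while shifting the data first keeps the computation well conditioned.

import numpy as np

obs = np.array(large_obs1, dtype=np.float32)

# Naive formula: both terms are ~3.24e6, where float32 spacing is only 0.25,
# so their true difference (~0.06) is swamped by rounding and can even come
# out negative, producing NaN as soon as a square root is taken.
naive_var = np.mean(obs ** 2) - np.mean(obs) ** 2

# Shifting by any value close to the mean before squaring avoids the cancellation.
shifted = obs - obs[0]
stable_var = np.mean(shifted ** 2) - np.mean(shifted) ** 2

assert abs(stable_var - np.var(large_obs1)) < 1e-3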