def test_reward_decreases(
    demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int
) -> None:
    np.random.seed(seed)
    torch.manual_seed(seed)
    buffer_expert = create_agent_buffer(behavior_spec, 1000)
    buffer_policy = create_agent_buffer(behavior_spec, 1000)
    demo_to_buffer.return_value = None, buffer_expert
    gail_settings = GAILSettings(
        demo_path="", learning_rate=0.005, use_vail=False, use_actions=use_actions
    )
    gail_rp = create_reward_provider(
        RewardSignalType.GAIL, behavior_spec, gail_settings
    )
    init_reward_expert = gail_rp.evaluate(buffer_expert)[0]
    init_reward_policy = gail_rp.evaluate(buffer_policy)[0]
    for _ in range(10):
        gail_rp.update(buffer_policy)
        reward_expert = gail_rp.evaluate(buffer_expert)[0]
        reward_policy = gail_rp.evaluate(buffer_policy)[0]
        assert reward_expert >= 0  # GAIL / VAIL reward always positive
        assert reward_policy >= 0
    reward_expert = gail_rp.evaluate(buffer_expert)[0]
    reward_policy = gail_rp.evaluate(buffer_policy)[0]
    assert reward_expert > reward_policy  # Expert reward greater than non-expert reward
    assert (
        reward_expert > init_reward_expert
    )  # Expert reward getting better as network trains
    assert (
        reward_policy < init_reward_policy
    )  # Non-expert reward getting worse as network trains
def test_gail_visual_sac(simple_record, use_discrete):
    demo_path = simple_record(use_discrete, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        use_discrete=use_discrete,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=500,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def test_reward_decreases_vail(
    demo_to_buffer: Any, use_actions: bool, behavior_spec: BehaviorSpec, seed: int
) -> None:
    np.random.seed(seed)
    torch.manual_seed(seed)
    buffer_expert = create_agent_buffer(behavior_spec, 1000)
    buffer_policy = create_agent_buffer(behavior_spec, 1000)
    demo_to_buffer.return_value = None, buffer_expert
    gail_settings = GAILSettings(
        demo_path="", learning_rate=0.005, use_vail=True, use_actions=use_actions
    )
    # we must set the initial value of beta to 0 for testing
    # If we do not, the kl-loss will dominate early and will block the estimator
    DiscriminatorNetwork.initial_beta = 0.0
    gail_rp = create_reward_provider(
        RewardSignalType.GAIL, behavior_spec, gail_settings
    )
    for _ in range(300):
        gail_rp.update(buffer_policy)
        reward_expert = gail_rp.evaluate(buffer_expert)[0]
        reward_policy = gail_rp.evaluate(buffer_policy)[0]
        assert reward_expert >= 0  # GAIL / VAIL reward always positive
        assert reward_policy >= 0
    reward_expert = gail_rp.evaluate(buffer_expert)[0]
    reward_policy = gail_rp.evaluate(buffer_policy)[0]
    assert reward_expert > reward_policy  # Expert reward greater than non-expert reward
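# The two reward-decrease tests above receive `demo_to_buffer`, `use_actions`,
# `behavior_spec`, and `seed` as arguments, which implies pytest parametrization
# plus a mock patch that this excerpt does not show. The sketch below is an
# assumption about that scaffolding, not code from this excerpt: the patch
# target path, the SEED list, and the create_behavior_spec() helper are
# illustrative stand-ins.
from unittest.mock import patch

import pytest

SEED = [42]  # assumed: any fixed seed keeps the tests deterministic
BEHAVIOR_SPECS = [create_behavior_spec()]  # hypothetical helper building BehaviorSpec variants


@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("behavior_spec", BEHAVIOR_SPECS)
@pytest.mark.parametrize("use_actions", [False, True])
@patch(
    # demo_to_buffer must be patched where the GAIL reward provider imports it,
    # so demo_path="" never touches the filesystem; this module path is assumed
    "mlagents.trainers.torch.components.reward_providers.gail_reward_provider.demo_to_buffer"
)
def test_reward_decreases(demo_to_buffer, use_actions, behavior_spec, seed):
    ...  # body as in the test above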
def test_gail_dc_visual(trainer_config, gail_dummy_config):
    gail_dummy_config_discrete = {
        RewardSignalType.GAIL: GAILSettings(demo_path=DISCRETE_PATH)
    }
    optimizer = create_optimizer_mock(
        trainer_config, gail_dummy_config_discrete, False, True, True
    )
    reward_signal_eval(optimizer, "gail")
    reward_signal_update(optimizer, "gail")
def test_reward_provider_save(tmp_path, optimizer):
    OptimizerClass, HyperparametersClass = optimizer
    trainer_settings = TrainerSettings()
    trainer_settings.hyperparameters = HyperparametersClass()
    trainer_settings.reward_signals = {
        RewardSignalType.CURIOSITY: CuriositySettings(),
        RewardSignalType.GAIL: GAILSettings(demo_path=DEMO_PATH),
        RewardSignalType.RND: RNDSettings(),
    }
    policy = create_policy_mock(trainer_settings, use_discrete=False)
    optimizer = OptimizerClass(policy, trainer_settings)

    # save at path 1
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TorchModelSaver(trainer_settings, path1)
    model_saver.register(policy)
    model_saver.register(optimizer)
    model_saver.initialize_or_load()
    policy.set_step(2000)
    model_saver.save_checkpoint("MockBrain", 2000)

    # create a new optimizer and policy
    optimizer2 = OptimizerClass(policy, trainer_settings)
    policy2 = create_policy_mock(trainer_settings, use_discrete=False)

    # load weights
    model_saver2 = TorchModelSaver(trainer_settings, path1, load=True)
    model_saver2.register(policy2)
    model_saver2.register(optimizer2)
    model_saver2.initialize_or_load()  # This is to load the optimizers

    # assert the models have the same weights
    module_dict_1 = optimizer.get_modules()
    module_dict_2 = optimizer2.get_modules()
    assert "Module:GAIL" in module_dict_1
    assert "Module:GAIL" in module_dict_2
    assert "Module:Curiosity" in module_dict_1
    assert "Module:Curiosity" in module_dict_2
    assert "Module:RND-pred" in module_dict_1
    assert "Module:RND-pred" in module_dict_2
    assert "Module:RND-target" in module_dict_1
    assert "Module:RND-target" in module_dict_2
    for name, module1 in module_dict_1.items():
        assert name in module_dict_2
        module2 = module_dict_2[name]
        if hasattr(module1, "parameters"):
            for param1, param2 in zip(module1.parameters(), module2.parameters()):
                assert param1.data.ne(param2.data).sum() == 0

    # Run some rewards
    data = create_agent_buffer(policy.behavior_spec, 1)
    for reward_name in optimizer.reward_signals.keys():
        rp_1 = optimizer.reward_signals[reward_name]
        rp_2 = optimizer2.reward_signals[reward_name]
        assert np.array_equal(rp_1.evaluate(data), rp_2.evaluate(data))
def test_gail(simple_record, use_discrete, trainer_config):
    demo_path = simple_record(use_discrete)
    env = SimpleEnvironment([BRAIN_NAME], use_discrete=use_discrete, step_size=0.2)
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    config = attr.evolve(
        trainer_config,
        reward_signals=reward_signals,
        behavioral_cloning=bc_settings,
        max_steps=500,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def test_gail(simple_record, action_sizes, trainer_config):
    demo_path = simple_record(action_sizes)
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    config = attr.evolve(
        trainer_config,
        reward_signals=reward_signals,
        behavioral_cloning=bc_settings,
        max_steps=500,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def test_load_different_reward_provider(caplog, tmp_path, optimizer):
    OptimizerClass, HyperparametersClass = optimizer
    trainer_settings = TrainerSettings()
    trainer_settings.hyperparameters = HyperparametersClass()
    trainer_settings.reward_signals = {
        RewardSignalType.CURIOSITY: CuriositySettings(),
        RewardSignalType.RND: RNDSettings(),
    }
    policy = create_policy_mock(trainer_settings, use_discrete=False)
    optimizer = OptimizerClass(policy, trainer_settings)

    # save at path 1
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TorchModelSaver(trainer_settings, path1)
    model_saver.register(policy)
    model_saver.register(optimizer)
    model_saver.initialize_or_load()
    assert len(optimizer.critic.value_heads.stream_names) == 2
    policy.set_step(2000)
    model_saver.save_checkpoint("MockBrain", 2000)

    trainer_settings2 = TrainerSettings()
    trainer_settings2.hyperparameters = HyperparametersClass()
    trainer_settings2.reward_signals = {
        RewardSignalType.GAIL: GAILSettings(demo_path=DEMO_PATH)
    }

    # create a new optimizer and policy
    policy2 = create_policy_mock(trainer_settings2, use_discrete=False)
    optimizer2 = OptimizerClass(policy2, trainer_settings2)

    # load weights
    model_saver2 = TorchModelSaver(trainer_settings2, path1, load=True)
    model_saver2.register(policy2)
    model_saver2.register(optimizer2)
    assert len(optimizer2.critic.value_heads.stream_names) == 1
    model_saver2.initialize_or_load()  # This is to load the optimizers

    # loading a checkpoint whose reward providers differ from the current
    # settings should emit at least one warning rather than fail outright
    messages = [rec.message for rec in caplog.records if rec.levelno == WARNING]
    assert len(messages) > 0
def test_gail_visual_ppo(simple_record, action_sizes):
    demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        action_sizes=action_sizes,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1500)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(PPO_TORCH_CONFIG.hyperparameters, learning_rate=5e-3)
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=1000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
# consumed by name as a pytest fixture (see test_gail_dc_visual above), so the
# fixture decorator is restored here
@pytest.fixture
def gail_dummy_config():
    return {RewardSignalType.GAIL: GAILSettings(demo_path=CONTINUOUS_PATH)}
def test_factory(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = create_reward_provider(
        RewardSignalType.GAIL, behavior_spec, gail_settings
    )
    assert gail_rp.name == "GAIL"
def test_construction(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
    assert gail_rp.name == "GAIL"
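# Usage sketch (illustrative, not from this excerpt): once constructed, the
# provider scores a batch of experience via evaluate(), as the reward-decrease
# tests above do; create_agent_buffer is the same test helper used there. The
# one-reward-per-buffered-step return shape is an assumption here.
def example_gail_rewards(behavior_spec: BehaviorSpec) -> None:
    gail_settings = GAILSettings(demo_path=CONTINUOUS_PATH)
    gail_rp = GAILRewardProvider(behavior_spec, gail_settings)
    buffer = create_agent_buffer(behavior_spec, 10)
    rewards = gail_rp.evaluate(buffer)  # expected: 10 non-negative rewards
    assert len(rewards) == 10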