def _create_merlin_algorithm(env,
                             encoder_fc_layers=(3, ),
                             latent_dim=4,
                             lstm_size=(4, ),
                             memory_size=20,
                             learning_rate=1e-3,
                             debug_summaries=True):
    """Create a small MerlinAlgorithm with identity-activation encoder/decoder
    for testing."""
    config = TrainerConfig(root_dir="dummy", unroll_length=6)
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    algorithm = MerlinAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        env=env,
        config=config,
        encoders=alf.networks.EncodingNetwork(
            input_tensor_spec=observation_spec,
            fc_layer_params=encoder_fc_layers,
            activation=math_ops.identity,
            name="ObsEncoder"),
        decoders=DecodingAlgorithm(
            decoder=alf.networks.EncodingNetwork(
                input_tensor_spec=alf.TensorSpec((latent_dim, )),
                fc_layer_params=encoder_fc_layers,
                activation=math_ops.identity,
                name="ObsDecoder"),
            loss_weight=100.),
        latent_dim=latent_dim,
        lstm_size=lstm_size,
        memory_size=memory_size,
        optimizer=alf.optimizers.AdamTF(lr=learning_rate),
        debug_summaries=debug_summaries)
    return algorithm
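# A minimal driver sketch for the factory above, assuming ALF's
# RNNPolicyUnittestEnv from alf.environments.suite_unittest. The env
# parameters, the iteration count, and the function name are illustrative,
# not taken from the original test module.
def _example_merlin_training_loop():
    from alf.environments.suite_unittest import RNNPolicyUnittestEnv

    env = RNNPolicyUnittestEnv(batch_size=100, steps_per_episode=5)
    algorithm = _create_merlin_algorithm(env, debug_summaries=False)
    for _ in range(100):
        algorithm.train_iter()  # one unroll + one gradient update per call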
def _test_off_policy_algorithm(self, root_dir):
    alf.summary.enable_summary()
    config = TrainerConfig(root_dir=root_dir,
                           unroll_length=5,
                           num_envs=1,
                           num_updates_per_train_iter=1,
                           mini_batch_length=5,
                           mini_batch_size=3,
                           use_rollout_state=True,
                           summarize_grads_and_vars=True,
                           summarize_action_distributions=True,
                           whole_replay_buffer_training=True)
    env = MyEnv(batch_size=3)
    alg = MyAlg(observation_spec=env.observation_spec(),
                action_spec=env.action_spec(),
                env=env,
                on_policy=False,
                config=config)
    for _ in range(100):
        alg.train_iter()

    time_step = common.get_initial_time_step(env)
    state = alg.get_initial_predict_state(env.batch_size)
    policy_step = alg.rollout_step(time_step, state)
    logits = policy_step.info.log_prob(torch.arange(3).reshape(3, 1))
    print("logits: ", logits)
    # Action 1 yields the most reward in MyEnv, so its log-probability
    # should be the highest after training.
    self.assertTrue(torch.all(logits[1, :] > logits[0, :]))
    self.assertTrue(torch.all(logits[1, :] > logits[2, :]))
def create_algorithm(env):
    """Create a small ActorCriticAlgorithm for testing.

    Note that the observation/action specs are hard-coded here rather than
    read from `env`.
    """
    config = TrainerConfig(root_dir="dummy", unroll_length=5)
    obs_spec = alf.TensorSpec((2, ), dtype='float32')
    action_spec = alf.BoundedTensorSpec(shape=(),
                                        dtype='int32',
                                        minimum=0,
                                        maximum=2)
    fc_layer_params = (10, 8, 6)
    actor_network = partial(
        ActorDistributionNetwork,
        fc_layer_params=fc_layer_params,
        discrete_projection_net_ctor=alf.networks.CategoricalProjectionNetwork)
    value_network = partial(ValueNetwork, fc_layer_params=(10, 8, 1))
    alg = ActorCriticAlgorithm(observation_spec=obs_spec,
                               action_spec=action_spec,
                               actor_network_ctor=actor_network,
                               value_network_ctor=value_network,
                               env=env,
                               config=config,
                               optimizer=alf.optimizers.Adam(lr=1e-2),
                               debug_summaries=True,
                               name="MyActorCritic")
    return alg
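# The SAC and DDPG tests below evaluate with an `unroll` helper that is
# defined elsewhere in the original test modules. A minimal sketch of what
# it presumably does, under the predict-step API used in these snippets
# (the exact signature in the original modules may differ):
def unroll(env, algorithm, steps, epsilon_greedy=0.1):
    """Step `env` for `steps` steps using algorithm.predict_step() and
    return the last TimeStep."""
    time_step = common.get_initial_time_step(env)
    policy_state = algorithm.get_initial_predict_state(env.batch_size)
    for _ in range(steps):
        policy_step = algorithm.predict_step(time_step, policy_state,
                                             epsilon_greedy)
        time_step = env.step(policy_step.output)
        policy_state = policy_step.state
    return time_step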
def test_sac_algorithm_discrete(self, use_parallel_network):
    num_env = 1
    config = TrainerConfig(
        root_dir="dummy",
        unroll_length=1,
        mini_batch_length=2,
        mini_batch_size=64,
        initial_collect_steps=500,
        whole_replay_buffer_training=False,
        clear_replay_buffer=False,
        num_envs=num_env,
    )
    env_class = PolicyUnittestEnv
    steps_per_episode = 13
    env = env_class(num_env,
                    steps_per_episode,
                    action_type=ActionType.Discrete)
    eval_env = env_class(100,
                         steps_per_episode,
                         action_type=ActionType.Discrete)

    obs_spec = env._observation_spec
    action_spec = env._action_spec

    fc_layer_params = (10, 10)
    q_network = partial(QNetwork, fc_layer_params=fc_layer_params)

    alg2 = SacAlgorithm(observation_spec=obs_spec,
                        action_spec=action_spec,
                        q_network_cls=q_network,
                        use_parallel_network=use_parallel_network,
                        env=env,
                        config=config,
                        critic_optimizer=alf.optimizers.Adam(lr=1e-3),
                        alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
                        debug_summaries=False,
                        name="MySAC")

    eval_env.reset()
    for i in range(700):
        alg2.train_iter()
        if i < config.initial_collect_steps:
            continue
        eval_env.reset()
        eval_time_step = unroll(eval_env, alg2, steps_per_episode - 1)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
            n_seconds=1)
    self.assertAlmostEqual(1.0,
                           float(eval_time_step.reward.mean()),
                           delta=0.2)
def test_trac_algorithm(self):
    config = TrainerConfig(root_dir="dummy", unroll_length=5)
    env = MyEnv(batch_size=3)
    alg = TracAlgorithm(observation_spec=env.observation_spec(),
                        action_spec=env.action_spec(),
                        ac_algorithm_cls=create_ac_algorithm,
                        env=env,
                        config=config)
    for _ in range(50):
        alg.train_iter()

    time_step = common.get_initial_time_step(env)
    state = alg.get_initial_predict_state(env.batch_size)
    policy_step = alg.rollout_step(time_step, state)
    logits = policy_step.info.action_distribution.log_prob(
        torch.arange(3).reshape(3, 1))
    print("logits: ", logits)
    # Action 1 gets the most reward, so its probability should be higher
    # than the other actions' after training.
    self.assertTrue(torch.all(logits[1, :] > logits[0, :]))
    self.assertTrue(torch.all(logits[1, :] > logits[2, :]))
def test_on_policy_algorithm(self):
    # root_dir is not used. We have to give it a value because
    # it is a required argument of TrainerConfig.
    config = TrainerConfig(root_dir='/tmp/rl_algorithm_test',
                           unroll_length=5,
                           num_envs=1)
    env = MyEnv(batch_size=3)
    alg = MyAlg(observation_spec=env.observation_spec(),
                action_spec=env.action_spec(),
                env=env,
                config=config,
                on_policy=True,
                debug_summaries=True)
    for _ in range(100):
        alg.train_iter()

    time_step = common.get_initial_time_step(env)
    state = alg.get_initial_predict_state(env.batch_size)
    policy_step = alg.rollout_step(time_step, state)
    logits = policy_step.info.log_prob(torch.arange(3).reshape(3, 1))
    print("logits: ", logits)
    # As in the off-policy test, action 1 should end up with the
    # highest log-probability.
    self.assertTrue(torch.all(logits[1, :] > logits[0, :]))
    self.assertTrue(torch.all(logits[1, :] > logits[2, :]))
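# The PPO and SARSA factories below read a module-level DEBUGGING flag
# that is not shown in these excerpts; presumably it is a plain constant
# along the lines of:
DEBUGGING = False  # flip to True to enable the extra debug summaries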
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = partial(ActorDistributionRNNNetwork,
                            fc_layer_params=(),
                            lstm_hidden_size=(4, ),
                            actor_fc_layer_params=())
        value_net = partial(ValueRNNNetwork,
                            fc_layer_params=(),
                            lstm_hidden_size=(4, ),
                            value_fc_layer_params=())
    else:
        actor_net = partial(
            ActorDistributionNetwork,
            fc_layer_params=(),
            continuous_projection_net_ctor=StableNormalProjectionNetwork)
        # Do not bind observation_spec into the partial here: the algorithm
        # supplies input_tensor_spec when it constructs the network, which
        # would collide with a positionally bound spec.
        value_net = partial(ValueNetwork, fc_layer_params=())

    optimizer = alf.optimizers.Adam(lr=learning_rate)
    config = TrainerConfig(root_dir="dummy",
                           unroll_length=13,
                           num_updates_per_train_iter=4,
                           mini_batch_size=25,
                           summarize_grads_and_vars=DEBUGGING)
    return PPOAlgorithm(observation_spec=observation_spec,
                        action_spec=action_spec,
                        env=env,
                        config=config,
                        actor_network_ctor=actor_net,
                        value_network_ctor=value_net,
                        loss=PPOLoss(gamma=1.0, debug_summaries=DEBUGGING),
                        optimizer=optimizer,
                        debug_summaries=DEBUGGING)
def test_ddpg_algorithm(self, num_critic_replicas, reward_dim):
    num_env = 128
    num_eval_env = 100
    steps_per_episode = 13
    config = TrainerConfig(
        root_dir="dummy",
        unroll_length=steps_per_episode,
        mini_batch_length=2,
        mini_batch_size=128,
        initial_collect_steps=steps_per_episode,
        whole_replay_buffer_training=False,
        clear_replay_buffer=False,
        num_envs=num_env,
    )
    env_class = PolicyUnittestEnv
    env = env_class(num_env,
                    steps_per_episode,
                    action_type=ActionType.Continuous,
                    reward_dim=reward_dim)
    eval_env = env_class(num_eval_env,
                         steps_per_episode,
                         action_type=ActionType.Continuous,
                         reward_dim=reward_dim)

    obs_spec = env._observation_spec
    action_spec = env._action_spec

    fc_layer_params = (16, 16)
    actor_network = functools.partial(ActorNetwork,
                                      fc_layer_params=fc_layer_params)
    critic_network = functools.partial(
        CriticNetwork,
        output_tensor_spec=env.reward_spec(),
        joint_fc_layer_params=fc_layer_params)

    alg = DdpgAlgorithm(
        observation_spec=obs_spec,
        action_spec=action_spec,
        actor_network_ctor=actor_network,
        critic_network_ctor=critic_network,
        # reward_weights are only meaningful for a multi-dimensional reward.
        reward_weights=[1, 2, 3] if reward_dim == 3 else None,
        env=env,
        config=config,
        num_critic_replicas=num_critic_replicas,
        use_parallel_network=num_critic_replicas > 1,
        actor_optimizer=alf.optimizers.Adam(lr=1e-2),
        critic_optimizer=alf.optimizers.Adam(lr=1e-2),
        debug_summaries=False,
        name="MyDDPG")

    for _ in range(500):
        alg.train_iter()

    eval_env.reset()
    epsilon_greedy = 0.0
    eval_time_step = unroll(eval_env, alg, steps_per_episode - 1,
                            epsilon_greedy)
    print(eval_time_step.reward.mean())
    self.assertAlmostEqual(1.0,
                           float(eval_time_step.reward.mean()),
                           delta=2e-1)
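# The SAC and SARSA snippets below use clipped_exp as the std_transform of
# the Normal projection network. In ALF it comes from alf.utils.math_ops;
# a sketch of the presumed behavior:
import torch

def clipped_exp(value, clip_value_min=-20., clip_value_max=2.):
    """exp() with the input clamped first, keeping the predicted std finite."""
    return torch.exp(torch.clamp(value, clip_value_min, clip_value_max))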
def test_sac_algorithm(self, use_parallel_network, reward_dim):
    num_env = 1
    config = TrainerConfig(
        root_dir="dummy",
        unroll_length=1,
        mini_batch_length=2,
        mini_batch_size=64,
        initial_collect_steps=500,
        whole_replay_buffer_training=False,
        clear_replay_buffer=False,
        num_envs=num_env,
    )
    env_class = PolicyUnittestEnv
    steps_per_episode = 13
    env = env_class(num_env,
                    steps_per_episode,
                    action_type=ActionType.Continuous,
                    reward_dim=reward_dim)
    eval_env = env_class(100,
                         steps_per_episode,
                         action_type=ActionType.Continuous,
                         reward_dim=reward_dim)

    obs_spec = env._observation_spec
    action_spec = env._action_spec

    fc_layer_params = (10, 10)
    continuous_projection_net_ctor = partial(
        alf.networks.NormalProjectionNetwork,
        state_dependent_std=True,
        scale_distribution=True,
        std_transform=clipped_exp)
    actor_network = partial(
        ActorDistributionNetwork,
        fc_layer_params=fc_layer_params,
        continuous_projection_net_ctor=continuous_projection_net_ctor)
    critic_network = partial(CriticNetwork,
                             output_tensor_spec=env.reward_spec(),
                             joint_fc_layer_params=fc_layer_params)

    alg = SacAlgorithm(observation_spec=obs_spec,
                       action_spec=action_spec,
                       actor_network_cls=actor_network,
                       critic_network_cls=critic_network,
                       use_parallel_network=use_parallel_network,
                       use_entropy_reward=(reward_dim == 1),
                       env=env,
                       config=config,
                       actor_optimizer=alf.optimizers.Adam(lr=1e-2),
                       critic_optimizer=alf.optimizers.Adam(lr=1e-2),
                       alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
                       debug_summaries=False,
                       name="MySAC")

    eval_env.reset()
    for i in range(700):
        alg.train_iter()
        if i < config.initial_collect_steps:
            continue
        eval_env.reset()
        eval_time_step = unroll(eval_env, alg, steps_per_episode - 1)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
            n_seconds=1)
    self.assertAlmostEqual(1.0,
                           float(eval_time_step.reward.mean()),
                           delta=0.3)
def _create_algorithm(env, sac, use_rnn, on_policy, priority_replay):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    fc_layer_params = (16, 16)
    continuous_projection_net_ctor = functools.partial(
        alf.networks.NormalProjectionNetwork,
        state_dependent_std=True,
        scale_distribution=True,
        std_transform=clipped_exp)

    if use_rnn:
        if sac:
            actor_net = functools.partial(
                ActorDistributionRNNNetwork,
                fc_layer_params=fc_layer_params,
                lstm_hidden_size=(4, ),
                continuous_projection_net_ctor=continuous_projection_net_ctor)
        else:
            actor_net = functools.partial(ActorRNNNetwork,
                                          fc_layer_params=fc_layer_params,
                                          lstm_hidden_size=(4, ))
        critic_net = functools.partial(CriticRNNNetwork,
                                       joint_fc_layer_params=fc_layer_params,
                                       lstm_hidden_size=(4, ))
    else:
        if sac:
            actor_net = functools.partial(
                ActorDistributionNetwork,
                fc_layer_params=fc_layer_params,
                continuous_projection_net_ctor=continuous_projection_net_ctor)
        else:
            actor_net = functools.partial(ActorNetwork,
                                          fc_layer_params=fc_layer_params)
        critic_net = functools.partial(CriticNetwork,
                                       joint_fc_layer_params=fc_layer_params)

    config = TrainerConfig(root_dir="dummy",
                           unroll_length=2,
                           initial_collect_steps=12 * 128 * 5,
                           use_rollout_state=True,
                           mini_batch_length=1,
                           mini_batch_size=256,
                           num_updates_per_train_iter=1,
                           whole_replay_buffer_training=False,
                           clear_replay_buffer=False,
                           priority_replay=priority_replay,
                           debug_summaries=DEBUGGING,
                           summarize_grads_and_vars=DEBUGGING,
                           summarize_action_distributions=DEBUGGING)
    return SarsaAlgorithm(observation_spec=observation_spec,
                          action_spec=action_spec,
                          env=env,
                          config=config,
                          on_policy=on_policy,
                          ou_stddev=0.2,
                          ou_damping=0.5,
                          actor_network_ctor=actor_net,
                          critic_network_ctor=critic_net,
                          actor_optimizer=alf.optimizers.AdamTF(lr=5e-3),
                          critic_optimizer=alf.optimizers.AdamTF(lr=2e-2),
                          alpha_optimizer=alf.optimizers.AdamTF(lr=2e-2),
                          debug_summaries=DEBUGGING)