Example #1
    def _test_off_policy_algorithm(self, root_dir):
        alf.summary.enable_summary()
        config = TrainerConfig(root_dir=root_dir,
                               unroll_length=5,
                               num_envs=1,
                               num_updates_per_train_iter=1,
                               mini_batch_length=5,
                               mini_batch_size=3,
                               use_rollout_state=True,
                               summarize_grads_and_vars=True,
                               summarize_action_distributions=True,
                               whole_replay_buffer_training=True)
        env = MyEnv(batch_size=3)
        alg = MyAlg(observation_spec=env.observation_spec(),
                    action_spec=env.action_spec(),
                    env=env,
                    on_policy=False,
                    config=config)
        for _ in range(100):
            alg.train_iter()

        time_step = common.get_initial_time_step(env)
        state = alg.get_initial_predict_state(env.batch_size)
        policy_step = alg.rollout_step(time_step, state)
        logits = policy_step.info.log_prob(torch.arange(3).reshape(3, 1))
        print("logits: ", logits)
        # Action 1 gets the most reward, so its log-probability should be
        # higher than the other actions' after training.
        self.assertTrue(torch.all(logits[1, :] > logits[0, :]))
        self.assertTrue(torch.all(logits[1, :] > logits[2, :]))
Example #2
def create_algorithm(env):
    config = TrainerConfig(root_dir="dummy", unroll_length=5)
    obs_spec = alf.TensorSpec((2, ), dtype='float32')
    action_spec = alf.BoundedTensorSpec(
        shape=(), dtype='int32', minimum=0, maximum=2)

    fc_layer_params = (10, 8, 6)

    actor_network = partial(
        ActorDistributionNetwork,
        fc_layer_params=fc_layer_params,
        discrete_projection_net_ctor=alf.networks.CategoricalProjectionNetwork)

    value_network = partial(ValueNetwork, fc_layer_params=(10, 8, 1))

    alg = ActorCriticAlgorithm(
        observation_spec=obs_spec,
        action_spec=action_spec,
        actor_network_ctor=actor_network,
        value_network_ctor=value_network,
        env=env,
        config=config,
        optimizer=alf.optimizers.Adam(lr=1e-2),
        debug_summaries=True,
        name="MyActorCritic")
    return alg
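
A minimal usage sketch for the factory above, assuming env is an ALF batched environment whose specs match the hard-coded obs_spec/action_spec (a 2-d float observation and a 3-way discrete action); the driver below is illustrative and only reuses calls shown in the other examples, it is not part of the original test.

# Hypothetical driver for create_algorithm().
alg = create_algorithm(env)
for _ in range(100):
    alg.train_iter()    # collect one unroll and update, as in the other examples

# Inspect the trained rollout policy the same way the RL tests above do.
time_step = common.get_initial_time_step(env)
state = alg.get_initial_predict_state(env.batch_size)
policy_step = alg.rollout_step(time_step, state)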
Example #3
def _create_merlin_algorithm(env,
                             encoder_fc_layers=(3, ),
                             latent_dim=4,
                             lstm_size=(4, ),
                             memory_size=20,
                             learning_rate=1e-3,
                             debug_summaries=True):
    config = TrainerConfig(root_dir="dummy", unroll_length=6)
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    algorithm = MerlinAlgorithm(
        observation_spec=observation_spec,
        action_spec=action_spec,
        env=env,
        config=config,
        encoders=alf.networks.EncodingNetwork(
            input_tensor_spec=observation_spec,
            fc_layer_params=encoder_fc_layers,
            activation=math_ops.identity,
            name="ObsEncoder"),
        decoders=DecodingAlgorithm(decoder=alf.networks.EncodingNetwork(
            input_tensor_spec=alf.TensorSpec((latent_dim, )),
            fc_layer_params=encoder_fc_layers,
            activation=math_ops.identity,
            name="ObsDecoder"),
                                   loss_weight=100.),
        latent_dim=latent_dim,
        lstm_size=lstm_size,
        memory_size=memory_size,
        optimizer=alf.optimizers.AdamTF(lr=learning_rate),
        debug_summaries=debug_summaries)

    return algorithm
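
A hedged sketch of driving the Merlin factory above, assuming env is an ALF batched environment compatible with MerlinAlgorithm; the original test constructs its own unittest environment and training schedule, neither of which is shown here.

# Hypothetical driver for _create_merlin_algorithm().
algorithm = _create_merlin_algorithm(env)
for _ in range(50):
    algorithm.train_iter()

time_step = common.get_initial_time_step(env)
state = algorithm.get_initial_predict_state(env.batch_size)
policy_step = algorithm.rollout_step(time_step, state)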
Example #4
    def __init__(self, config: TrainerConfig):
        """Create a SLTrainer

        Args:
            config (TrainerConfig): configuration used to construct this trainer
        """
        super().__init__(config)

        assert config.num_iterations > 0, \
            "Must provide num_iterations for training!"

        self._num_epochs = config.num_iterations
        self._trainer_progress.set_termination_criterion(self._num_epochs)

        trainset, testset = self._create_dataset()
        input_tensor_spec = TensorSpec(shape=trainset.dataset[0][0].shape)
        if hasattr(trainset.dataset, 'classes'):
            output_dim = len(trainset.dataset.classes)
        else:
            output_dim = len(trainset.dataset[0][1])

        self._algorithm = config.algorithm_ctor(
            input_tensor_spec=input_tensor_spec,
            last_layer_param=(output_dim, True),
            last_activation=math_ops.identity,
            config=config)

        self._algorithm.set_data_loader(trainset, testset)
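
A hedged construction sketch for the trainer above. It assumes TrainerConfig accepts num_iterations and algorithm_ctor as keyword arguments (the __init__ only reads them back as attributes), and MyClassifierAlgorithm and the argument values are placeholders, not names from the original snippet.

# Hypothetical construction of an SLTrainer; MyClassifierAlgorithm is a
# placeholder ctor that must accept the kwargs passed in __init__ above
# (input_tensor_spec, last_layer_param, last_activation, config).
config = TrainerConfig(
    root_dir="/tmp/sl_trainer",
    num_iterations=10,                     # becomes self._num_epochs above
    algorithm_ctor=MyClassifierAlgorithm)
trainer = SLTrainer(config)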
Example #5
    def test_sac_algorithm_discrete(self, use_parallel_network):
        num_env = 1
        config = TrainerConfig(
            root_dir="dummy",
            unroll_length=1,
            mini_batch_length=2,
            mini_batch_size=64,
            initial_collect_steps=500,
            whole_replay_buffer_training=False,
            clear_replay_buffer=False,
            num_envs=num_env,
        )
        env_class = PolicyUnittestEnv

        steps_per_episode = 13
        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Discrete)

        eval_env = env_class(100,
                             steps_per_episode,
                             action_type=ActionType.Discrete)

        obs_spec = env._observation_spec
        action_spec = env._action_spec

        fc_layer_params = (10, 10)

        q_network = partial(QNetwork, fc_layer_params=fc_layer_params)

        alg2 = SacAlgorithm(observation_spec=obs_spec,
                            action_spec=action_spec,
                            q_network_cls=q_network,
                            use_parallel_network=use_parallel_network,
                            env=env,
                            config=config,
                            critic_optimizer=alf.optimizers.Adam(lr=1e-3),
                            alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
                            debug_summaries=False,
                            name="MySAC")

        eval_env.reset()
        for i in range(700):
            alg2.train_iter()
            if i < config.initial_collect_steps:
                continue
            eval_env.reset()
            eval_time_step = unroll(eval_env, alg2, steps_per_episode - 1)
            logging.log_every_n_seconds(
                logging.INFO,
                "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
                n_seconds=1)

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=0.2)
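
The evaluation helper unroll() used above is not shown in these snippets. Below is a minimal sketch of what such a helper could look like, built only from calls that appear elsewhere in these examples; the actual helper may differ (for instance, Example #9 passes an extra epsilon_greedy argument, which this sketch omits).

# Sketch only: step the rollout policy for a fixed number of steps and return
# the final TimeStep, whose reward field the tests average over.
def unroll(env, algorithm, steps):
    time_step = common.get_initial_time_step(env)
    state = algorithm.get_initial_predict_state(env.batch_size)
    for _ in range(steps):
        policy_step = algorithm.rollout_step(time_step, state)
        state = policy_step.state
        time_step = env.step(policy_step.output)
    return time_step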
Example #6
    def test_trac_algorithm(self):
        config = TrainerConfig(root_dir="dummy", unroll_length=5)
        env = MyEnv(batch_size=3)
        alg = TracAlgorithm(observation_spec=env.observation_spec(),
                            action_spec=env.action_spec(),
                            ac_algorithm_cls=create_ac_algorithm,
                            env=env,
                            config=config)

        for _ in range(50):
            alg.train_iter()

        time_step = common.get_initial_time_step(env)
        state = alg.get_initial_predict_state(env.batch_size)
        policy_step = alg.rollout_step(time_step, state)
        logits = policy_step.info.action_distribution.log_prob(
            torch.arange(3).reshape(3, 1))
        print("logits: ", logits)
        # Action 1 gets the most reward, so its probability should be higher
        # than the other actions' after training.
        self.assertTrue(torch.all(logits[1, :] > logits[0, :]))
        self.assertTrue(torch.all(logits[1, :] > logits[2, :]))
Example #7
    def test_on_policy_algorithm(self):
        # root_dir is not used. We have to give it a value because
        # it is a required argument of TrainerConfig.
        config = TrainerConfig(root_dir='/tmp/rl_algorithm_test',
                               unroll_length=5,
                               num_envs=1)
        env = MyEnv(batch_size=3)
        alg = MyAlg(observation_spec=env.observation_spec(),
                    action_spec=env.action_spec(),
                    env=env,
                    config=config,
                    on_policy=True,
                    debug_summaries=True)
        for _ in range(100):
            alg.train_iter()

        time_step = common.get_initial_time_step(env)
        state = alg.get_initial_predict_state(env.batch_size)
        policy_step = alg.rollout_step(time_step, state)
        logits = policy_step.info.log_prob(torch.arange(3).reshape(3, 1))
        print("logits: ", logits)
        # Action 1 gets the most reward, so its log-probability should be
        # higher than the other actions' after training.
        self.assertTrue(torch.all(logits[1, :] > logits[0, :]))
        self.assertTrue(torch.all(logits[1, :] > logits[2, :]))
Example #8
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = partial(ActorDistributionRNNNetwork,
                            fc_layer_params=(),
                            lstm_hidden_size=(4, ),
                            actor_fc_layer_params=())
        value_net = partial(ValueRNNNetwork,
                            fc_layer_params=(),
                            lstm_hidden_size=(4, ),
                            value_fc_layer_params=())
    else:
        actor_net = partial(
            ActorDistributionNetwork,
            fc_layer_params=(),
            continuous_projection_net_ctor=StableNormalProjectionNetwork)
        value_net = partial(ValueNetwork, fc_layer_params=())

    optimizer = alf.optimizers.Adam(lr=learning_rate)

    config = TrainerConfig(root_dir="dummy",
                           unroll_length=13,
                           num_updates_per_train_iter=4,
                           mini_batch_size=25,
                           summarize_grads_and_vars=DEBUGGING)

    return PPOAlgorithm(observation_spec=observation_spec,
                        action_spec=action_spec,
                        env=env,
                        config=config,
                        actor_network_ctor=actor_net,
                        value_network_ctor=value_net,
                        loss=PPOLoss(gamma=1.0, debug_summaries=DEBUGGING),
                        optimizer=optimizer,
                        debug_summaries=DEBUGGING)
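
A hedged usage sketch for the PPO factory above; the environment below is a placeholder built the same way as in the SAC/DDPG examples, and DEBUGGING is assumed to be a module-level flag defined alongside the factory.

# Hypothetical driver for create_algorithm(); not part of the original test.
env = PolicyUnittestEnv(128, 13, action_type=ActionType.Continuous)
alg = create_algorithm(env, use_rnn=False)
for _ in range(100):
    alg.train_iter()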
Example #9
    def test_ddpg_algorithm(self, num_critic_replicas, reward_dim):
        num_env = 128
        num_eval_env = 100
        steps_per_episode = 13
        config = TrainerConfig(
            root_dir="dummy",
            unroll_length=steps_per_episode,
            mini_batch_length=2,
            mini_batch_size=128,
            initial_collect_steps=steps_per_episode,
            whole_replay_buffer_training=False,
            clear_replay_buffer=False,
            num_envs=num_env,
        )
        env_class = PolicyUnittestEnv

        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Continuous,
                        reward_dim=reward_dim)

        eval_env = env_class(num_eval_env,
                             steps_per_episode,
                             action_type=ActionType.Continuous,
                             reward_dim=reward_dim)

        obs_spec = env._observation_spec
        action_spec = env._action_spec

        fc_layer_params = (16, 16)

        actor_network = functools.partial(ActorNetwork,
                                          fc_layer_params=fc_layer_params)

        critic_network = functools.partial(
            CriticNetwork,
            output_tensor_spec=env.reward_spec(),
            joint_fc_layer_params=fc_layer_params)

        alg = DdpgAlgorithm(observation_spec=obs_spec,
                            action_spec=action_spec,
                            actor_network_ctor=actor_network,
                            critic_network_ctor=critic_network,
                            reward_weights=[1, 2, 3],
                            env=env,
                            config=config,
                            num_critic_replicas=num_critic_replicas,
                            use_parallel_network=num_critic_replicas > 1,
                            actor_optimizer=alf.optimizers.Adam(lr=1e-2),
                            critic_optimizer=alf.optimizers.Adam(lr=1e-2),
                            debug_summaries=False,
                            name="MyDDPG")

        for _ in range(500):
            alg.train_iter()

        eval_env.reset()
        epsilon_greedy = 0.0
        eval_time_step = unroll(eval_env, alg, steps_per_episode - 1,
                                epsilon_greedy)
        print(eval_time_step.reward.mean())

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=2e-1)
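
For the multi-dimensional case (reward_dim=3), the reward_weights=[1, 2, 3] argument tells the algorithm how to combine per-dimension values into a single scalar when selecting actions. The snippet below is a conceptual sketch of such a weighted combination, not ALF's actual implementation.

# Conceptual sketch only: scalarize a [B, 3] value tensor with the weights
# passed as reward_weights above.
values = torch.randn(4, 3)                       # placeholder per-dimension values
weights = torch.tensor([1., 2., 3.])
scalar_values = (values * weights).sum(dim=-1)   # shape [4]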
Example #10
    def test_sac_algorithm(self, use_parallel_network, reward_dim):
        num_env = 1
        config = TrainerConfig(
            root_dir="dummy",
            unroll_length=1,
            mini_batch_length=2,
            mini_batch_size=64,
            initial_collect_steps=500,
            whole_replay_buffer_training=False,
            clear_replay_buffer=False,
            num_envs=1,
        )
        env_class = PolicyUnittestEnv
        steps_per_episode = 13
        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Continuous,
                        reward_dim=reward_dim)

        eval_env = env_class(100,
                             steps_per_episode,
                             action_type=ActionType.Continuous,
                             reward_dim=reward_dim)

        obs_spec = env._observation_spec
        action_spec = env._action_spec

        fc_layer_params = (10, 10)

        continuous_projection_net_ctor = partial(
            alf.networks.NormalProjectionNetwork,
            state_dependent_std=True,
            scale_distribution=True,
            std_transform=clipped_exp)

        actor_network = partial(
            ActorDistributionNetwork,
            fc_layer_params=fc_layer_params,
            continuous_projection_net_ctor=continuous_projection_net_ctor)

        critic_network = partial(CriticNetwork,
                                 output_tensor_spec=env.reward_spec(),
                                 joint_fc_layer_params=fc_layer_params)

        alg = SacAlgorithm(observation_spec=obs_spec,
                           action_spec=action_spec,
                           actor_network_cls=actor_network,
                           critic_network_cls=critic_network,
                           use_parallel_network=use_parallel_network,
                           use_entropy_reward=reward_dim == 1,
                           env=env,
                           config=config,
                           actor_optimizer=alf.optimizers.Adam(lr=1e-2),
                           critic_optimizer=alf.optimizers.Adam(lr=1e-2),
                           alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
                           debug_summaries=False,
                           name="MySAC")

        eval_env.reset()
        for i in range(700):
            alg.train_iter()
            if i < config.initial_collect_steps:
                continue
            eval_env.reset()
            eval_time_step = unroll(eval_env, alg, steps_per_episode - 1)
            logging.log_every_n_seconds(
                logging.INFO,
                "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
                n_seconds=1)

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=0.3)
Example #11
def _create_algorithm(env, sac, use_rnn, on_policy, priority_replay):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()
    fc_layer_params = (16, 16)
    continuous_projection_net_ctor = functools.partial(
        alf.networks.NormalProjectionNetwork,
        state_dependent_std=True,
        scale_distribution=True,
        std_transform=clipped_exp)

    if use_rnn:
        if sac:
            actor_net = functools.partial(
                ActorDistributionRNNNetwork,
                fc_layer_params=fc_layer_params,
                lstm_hidden_size=(4, ),
                continuous_projection_net_ctor=continuous_projection_net_ctor)
        else:
            actor_net = functools.partial(ActorRNNNetwork,
                                          fc_layer_params=fc_layer_params,
                                          lstm_hidden_size=(4, ))
        critic_net = functools.partial(CriticRNNNetwork,
                                       joint_fc_layer_params=fc_layer_params,
                                       lstm_hidden_size=(4, ))
    else:
        if sac:
            actor_net = functools.partial(
                ActorDistributionNetwork,
                fc_layer_params=fc_layer_params,
                continuous_projection_net_ctor=continuous_projection_net_ctor)
        else:
            actor_net = functools.partial(ActorNetwork,
                                          fc_layer_params=fc_layer_params)

        critic_net = functools.partial(CriticNetwork,
                                       joint_fc_layer_params=fc_layer_params)

    config = TrainerConfig(root_dir="dummy",
                           unroll_length=2,
                           initial_collect_steps=12 * 128 * 5,
                           use_rollout_state=True,
                           mini_batch_length=1,
                           mini_batch_size=256,
                           num_updates_per_train_iter=1,
                           whole_replay_buffer_training=False,
                           clear_replay_buffer=False,
                           priority_replay=priority_replay,
                           debug_summaries=DEBUGGING,
                           summarize_grads_and_vars=DEBUGGING,
                           summarize_action_distributions=DEBUGGING)

    return SarsaAlgorithm(observation_spec=observation_spec,
                          action_spec=action_spec,
                          env=env,
                          config=config,
                          on_policy=on_policy,
                          ou_stddev=0.2,
                          ou_damping=0.5,
                          actor_network_ctor=actor_net,
                          critic_network_ctor=critic_net,
                          actor_optimizer=alf.optimizers.AdamTF(lr=5e-3),
                          critic_optimizer=alf.optimizers.AdamTF(lr=2e-2),
                          alpha_optimizer=alf.optimizers.AdamTF(lr=2e-2),
                          debug_summaries=DEBUGGING)
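
A hedged sketch of calling the Sarsa factory above; the environment and flag values are placeholders that mirror the function's parameters and the call patterns in the other examples.

# Hypothetical driver for _create_algorithm(); flags select SAC-style vs.
# deterministic actor, RNN vs. feedforward networks, on- vs. off-policy
# training, and prioritized replay.
env = PolicyUnittestEnv(128, 12, action_type=ActionType.Continuous)
alg = _create_algorithm(
    env, sac=True, use_rnn=False, on_policy=False, priority_replay=True)
for _ in range(100):
    alg.train_iter()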