Example #1
    def test_actor_critic_continuous_policy(self):
        batch_size = 100
        steps_per_episode = 13
        env = PolicyUnittestEnv(batch_size,
                                steps_per_episode,
                                action_type=ActionType.Continuous)
        # We need to wrap env using TFPyEnvironment because the methods of env
        # have side effects (e.g., env._current_time_step can be changed).
        env = TFPyEnvironment(env)
        action_spec = env.action_spec()
        observation_spec = env.observation_spec()
        algorithm = ActorCriticAlgorithm(
            observation_spec=observation_spec,
            action_spec=action_spec,
            actor_network=ActorDistributionNetwork(observation_spec,
                                                   action_spec,
                                                   fc_layer_params=()),
            value_network=ValueNetwork(observation_spec, fc_layer_params=()),
            optimizer=tf.optimizers.Adam(learning_rate=1e-2))
        driver = OnPolicyDriver(env, algorithm, train_interval=2)
        eval_driver = OnPolicyDriver(env, algorithm, training=False)

        driver.run = tf.function(driver.run)

        t0 = time.time()
        driver.run(max_num_steps=2600 * batch_size)
        print("time=%s" % (time.time() - t0))

        env.reset()
        time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)
        print("reward=%s" % tf.reduce_mean(time_step.reward))
        self.assertAlmostEqual(1.0,
                               float(tf.reduce_mean(time_step.reward)),
                               delta=5e-2)
Example #2
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(observation_spec,
                                                action_spec,
                                                input_fc_layer_params=(),
                                                output_fc_layer_params=(),
                                                lstm_size=(4, ))
        value_net = ValueRnnNetwork(observation_spec,
                                    input_fc_layer_params=(),
                                    output_fc_layer_params=(),
                                    lstm_size=(4, ))
    else:
        actor_net = ActorDistributionNetwork(observation_spec,
                                             action_spec,
                                             fc_layer_params=())
        value_net = ValueNetwork(observation_spec, fc_layer_params=())

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    ac_algorithm = ActorCriticAlgorithm(action_spec=action_spec,
                                        actor_network=actor_net,
                                        value_network=value_net,
                                        loss=PPOLoss(action_spec=action_spec,
                                                     gamma=1.0),
                                        optimizer=optimizer)
    return PPOAlgorithm(ac_algorithm)
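
The factory above only constructs the PPO algorithm; a minimal sketch of building it against the unittest environment from Example #1 and rolling out its (untrained) policy for evaluation (the environment choice and step counts are illustrative assumptions):

# Hypothetical wiring, following the pattern from Example #1 (ALF imports assumed).
batch_size = 100
env = TFPyEnvironment(
    PolicyUnittestEnv(batch_size, 13, action_type=ActionType.Continuous))
algorithm = create_algorithm(env, use_rnn=False, learning_rate=1e-1)
eval_driver = OnPolicyDriver(env, algorithm, training=False)
time_step, _ = eval_driver.run(max_num_steps=4 * batch_size)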
Example #3
def create_algorithm(env, use_rnn=False, learning_rate=1e-1):
    observation_spec = env.observation_spec()
    action_spec = env.action_spec()

    if use_rnn:
        actor_net = ActorDistributionRnnNetwork(observation_spec,
                                                action_spec,
                                                input_fc_layer_params=(),
                                                output_fc_layer_params=(),
                                                lstm_size=(4, ))
        value_net = ValueRnnNetwork(observation_spec,
                                    input_fc_layer_params=(),
                                    output_fc_layer_params=(),
                                    lstm_size=(4, ))
    else:
        actor_net = ActorDistributionNetwork(
            observation_spec,
            action_spec,
            fc_layer_params=(),
            continuous_projection_net=StableNormalProjectionNetwork)
        value_net = ValueNetwork(observation_spec, fc_layer_params=())

    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    return PPOAlgorithm(action_spec=action_spec,
                        actor_network=actor_net,
                        value_network=value_net,
                        loss=PPOLoss(action_spec=action_spec,
                                     gamma=1.0,
                                     debug_summaries=DEBUGGING),
                        optimizer=optimizer,
                        debug_summaries=DEBUGGING)
Example #4
def build_value_net(observation_space):
    fc_layers = [7 * 7 * 64, 512]
    q_net = ValueNetwork(observation_space,
                         fc_layer_params=fc_layers,
                         dropout_layer_params=[0.1, 0.3])  # dropout rates must lie in [0, 1)

    return q_net
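
A minimal usage sketch for the factory above; the concrete observation shape and dtype are illustrative assumptions:

# Hypothetical usage: build the value network for an 84x84x4 image observation.
import tensorflow as tf
from tf_agents.specs import tensor_spec

observation_space = tensor_spec.TensorSpec((84, 84, 4), tf.float32)
value_net = build_value_net(observation_space)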
Example #5
File: run_ppo.py Project: adak32/bellman
def train_eval(
    # tensorboard files
    root_dir,
    # environment
    env_name="CartPole-v1",
    random_seed=0,
    # Params for collect
    num_environment_steps=100000,
    replay_buffer_capacity=1001,  # Per-environment
    # Params for eval
    num_eval_episodes=30,
    eval_interval=200,
    # Params for summaries
    summary_interval=50,
):
    tf.compat.v1.set_random_seed(random_seed)

    environment = TFPyEnvironment(suite_gym.load(env_name))
    evaluation_environment = TFPyEnvironment(suite_gym.load(env_name))

    actor_net = ActorDistributionNetwork(environment.observation_spec(),
                                         environment.action_spec(),
                                         fc_layer_params=(200, 100))
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=(200, 100))
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = PPOClipAgent(  # should be closer to the paper than PPOAgent...
        environment.time_step_spec(),
        environment.action_spec(),
        optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work
        actor_net=actor_net,
        value_net=value_net,
        importance_ratio_clipping=0.2,
        normalize_observations=False,
        normalize_rewards=False,
        use_gae=True,
        lambda_value=0.5,
        discount_factor=0.95,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(400)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=True,
    )
    experiment_harness.run()
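
Every parameter except `root_dir` has a default, so a minimal launch of the harness can look like the sketch below (the log directory is an illustrative assumption):

# Hypothetical invocation: train and evaluate PPO on CartPole-v1 with the defaults above.
train_eval(root_dir="./logs/ppo_cartpole")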
Example #6
def _create_ac_algorithm():
    observation_spec = common.get_observation_spec()
    action_spec = common.get_action_spec()
    optimizer = tf.optimizers.Adam(learning_rate=5e-5)
    actor_net = ActorDistributionNetwork(observation_spec,
                                         action_spec,
                                         fc_layer_params=(8, ))
    value_net = ValueNetwork(observation_spec, fc_layer_params=(8, ))

    return ActorCriticAlgorithm(action_spec=action_spec,
                                actor_network=actor_net,
                                value_network=value_net,
                                loss_class=ActorCriticLoss,
                                optimizer=optimizer,
                                debug_summaries=True)
Example #7
    def __init__(
        self,
        time_step_spec: ts.TimeStep,
        action_spec: types.NestedTensorSpec,
        transition_model_type: TransitionModelType,
        num_hidden_layers_model: int,
        num_hidden_nodes_model: int,
        activation_function_model: Callable,
        ensemble_size: int,
        predict_state_difference: bool,
        epochs: int,
        training_batch_size: int,
        callbacks: List[tf.keras.callbacks.Callback],
        reward_model: RewardModel,
        initial_state_distribution_model: InitialStateDistributionModel,
        trajectory_sampler_type: TrajectorySamplerType,
        horizon: int,
        population_size: int,
        model_free_agent_type: ModelFreeAgentType,
        num_hidden_layers_agent: int,
        num_hidden_nodes_agent: int,
        activation_function_agent: Callable,
        model_free_training_iterations: int,
        debug_summaries: bool = False,
        train_step_counter: Optional[tf.Variable] = None,
    ):
        """
        Initializes the agent

        :param time_step_spec: A nest of tf.TypeSpec representing the time_steps.
        :param action_spec: A nest of BoundedTensorSpec representing the actions.
        :param transition_model_type: An indicator of which of the available transition models
            should be used; the list can be found in `TransitionModelType`. A component of the
            environment model that describes the transition dynamics.
        :param num_hidden_layers_model: A transition model parameter, used for constructing a neural
            network. The number of hidden layers in the neural network.
        :param num_hidden_nodes_model: A transition model parameter, used for constructing a neural
            network. The number of nodes in each hidden layer. The parameter is shared across all layers.
        :param activation_function_model: A transition model parameter, used for constructing a
            neural network. An activation function of the hidden nodes.
        :param ensemble_size: A transition model parameter, used for constructing a neural
            network. The number of networks in the ensemble.
        :param predict_state_difference: A transition model parameter, used for constructing a
            neural network. A boolean indicating whether the transition model predicts the
            difference between the current and next state, or the next state directly.
        :param epochs: A transition model parameter, used by the Keras fit method. The number of
            epochs used for training the neural network.
        :param training_batch_size: A transition model parameter, used by the Keras fit method. The
            batch size used for training the neural network.
        :param callbacks: A transition model parameter, used by the Keras fit method. A list of Keras
            callbacks used for training the neural network.
        :param reward_model: A component of the environment model that describes the
            rewards. At the moment only pre-specified reward models are allowed, i.e. the agent
            assumes the reward function is known.
        :param initial_state_distribution_model: A component of the environment model that
            describes the initial state distribution (can be either deterministic or
            probabilistic). At the moment only pre-specified initial state distribution models
            are allowed, i.e. the agent assumes the initial state distribution is known.
        :param trajectory_sampler_type: An indicator of which of the available trajectory samplers
            should be used; the list can be found in `TrajectorySamplerType`. The trajectory sampler
            determines how predictions from an ensemble of neural networks that model the
            transition dynamics are sampled. Works only with ensemble-type transition models.
        :param horizon: A trajectory optimiser parameter. The number of steps taken in the
            environment in each virtual rollout.
        :param population_size: A trajectory optimiser parameter. The number of virtual rollouts
            that are simulated in each iteration during trajectory optimization.
        :param model_free_agent_type: Type of model-free agent, e.g. PPO or TRPO.
        :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of hidden layers in each network.
        :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of nodes in each hidden layer. The
            parameter is shared across all layers.
        :param activation_function_agent: A model-free agent parameter, used for constructing a
            neural network. An activation function of the hidden nodes.
        :param model_free_training_iterations: Number of model-free training iterations per
            train call.
        :param debug_summaries: A bool; if true, subclasses should gather debug summaries.
        :param train_step_counter: An optional counter to increment every time the train op is run.
            Defaults to the global_step.
        """

        assert ensemble_size > 0, "ensemble_size must be an integer > 0"
        assert num_hidden_layers_agent >= 0
        if num_hidden_layers_agent > 0:
            assert num_hidden_nodes_agent > 0

        self._ensemble_size = ensemble_size
        observation_spec = time_step_spec.observation

        # trajectory sampler (meaningful only for ensemble models)
        trajectory_sampler: Optional[TrajectorySamplingStrategy] = None
        if transition_model_type in [
                TransitionModelType.DeterministicEnsemble,
                TransitionModelType.ProbabilisticEnsemble,
        ]:
            trajectory_sampler = build_trajectory_sampler_from_type(
                ensemble_size=ensemble_size,
                trajectory_sampler_type=trajectory_sampler_type,
                batch_size=population_size,
            )

        # transition dynamics model plus training spec
        transition_model, training_spec = build_transition_model_and_training_spec_from_type(
            observation_spec=observation_spec,
            action_spec=action_spec,
            transition_model_type=transition_model_type,
            num_hidden_layers=num_hidden_layers_model,
            num_hidden_nodes=num_hidden_nodes_model,
            activation_function=activation_function_model,
            ensemble_size=ensemble_size,
            predict_state_difference=predict_state_difference,
            epochs=epochs,
            training_batch_size=training_batch_size,
            callbacks=callbacks,
            trajectory_sampler=trajectory_sampler,
        )

        # model-free agent
        actor_net = ActorDistributionNetwork(
            input_tensor_spec=observation_spec,
            output_tensor_spec=action_spec,
            fc_layer_params=[num_hidden_nodes_agent] * num_hidden_layers_agent,
            activation_fn=activation_function_agent,
        )
        value_net = ValueNetwork(
            input_tensor_spec=observation_spec,
            fc_layer_params=[num_hidden_nodes_agent] * num_hidden_layers_agent,
            activation_fn=activation_function_agent,
        )
        if model_free_agent_type == ModelFreeAgentType.Ppo:
            model_free_agent = PPOClipAgent(  # the one normally used for experiments...
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_net=actor_net,
                value_net=value_net,
                optimizer=tf.compat.v1.train.AdamOptimizer(),  # default None does not work...
            )
        elif model_free_agent_type == ModelFreeAgentType.Trpo:
            model_free_agent = TRPOAgent(
                time_step_spec=time_step_spec,
                action_spec=action_spec,
                actor_net=actor_net,
                value_net=value_net,
            )
        else:
            raise RuntimeError("Unknown or unsupported agent type")

        super().__init__(
            (transition_model, training_spec),
            reward_model,
            initial_state_distribution_model,
            model_free_agent,
            population_size,
            horizon,
            model_free_training_iterations,
            debug_summaries,
            train_step_counter,
        )
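
The actor and critic networks above share one fully connected stack whose width and depth come from the two agent parameters; the sketch below shows that construction in isolation (the observation/action specs and layer sizes are illustrative assumptions):

# Stand-alone illustration of the agent-network construction used in __init__.
import tensorflow as tf
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.value_network import ValueNetwork
from tf_agents.specs import tensor_spec

observation_spec = tensor_spec.TensorSpec((4, ), tf.float32)
action_spec = tensor_spec.BoundedTensorSpec((1, ), tf.float32, minimum=-1.0, maximum=1.0)

num_hidden_layers_agent, num_hidden_nodes_agent = 2, 64
fc_layer_params = [num_hidden_nodes_agent] * num_hidden_layers_agent  # -> [64, 64]

actor_net = ActorDistributionNetwork(
    input_tensor_spec=observation_spec,
    output_tensor_spec=action_spec,
    fc_layer_params=fc_layer_params,
    activation_fn=tf.keras.activations.relu)
value_net = ValueNetwork(
    input_tensor_spec=observation_spec,
    fc_layer_params=fc_layer_params,
    activation_fn=tf.keras.activations.relu)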
Example #8
import tensorflow as tf

from tf_agents.agents.ppo import ppo_agent
from tf_agents.networks import actor_distribution_network
from tf_agents.networks.value_network import ValueNetwork

actor_net = actor_distribution_network.ActorDistributionNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=actor_fc_layer_params)

value_net = ValueNetwork(train_env.observation_spec())

global_step = tf.compat.v2.Variable(0)
tf_agent = ppo_agent.PPOAgent(train_env.time_step_spec(),
                              train_env.action_spec(),
                              actor_net=actor_net,
                              value_net=value_net,
                              optimizer=tf.compat.v1.train.AdamOptimizer(
                                  learning_rate=actor_learning_rate),
                              train_step_counter=global_step)
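
A short follow-up using the standard TF-Agents agent interface: the agent must be initialized before its policies are used for collection or evaluation.

# Build the agent's variables and expose its policies.
tf_agent.initialize()
eval_policy = tf_agent.policy             # policy used for evaluation
collect_policy = tf_agent.collect_policy  # policy used for data collection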
Example #9
        continuous_projection_net=tanh_normal_projection_network.TanhNormalProjectionNetwork,
        name='ActorDistributionNetwork'
    )
    print('Actor Network Created.')

    # Critic Network
    market_shape = observation_spec['observation_market'].shape
    market_filters = int((market_shape[0] * market_shape[1]) // 100)
    critic_net = ValueNetwork(
        observation_spec,
        preprocessing_layers={
            'observation_market': kr.models.Sequential([
                kr.layers.Conv2D(filters=market_filters,
                                 kernel_size=3,
                                 activation='relu',
                                 input_shape=(market_shape[0], market_shape[1], 1)),
                kr.layers.Conv2D(filters=market_filters,
                                 kernel_size=3,
                                 activation='relu'),
                kr.layers.Flatten(),
                kr.layers.Dense(5, activation='tanh'),
                kr.layers.Flatten()
            ]),
            # Alternative: a single Conv2D preprocessing layer with the same
            # filter settings for 'observation_market'.
            'observation_holdingRate': kr.layers.Dense(2, activation='tanh')
        },
        preprocessing_combiner=kr.layers.Concatenate(axis=-1),
        conv_layer_params=None,
        fc_layer_params=critic_commonDenseLayerParams,
        dtype=tf.float32,
        name='Critic Network'
    )

    # Agent
    # https://www.tensorflow.org/agents/api_docs/python/tf_agents/agents/ReinforceAgent
    global_step = tf.compat.v1.train.get_or_create_global_step()
    if shouldContinueFromLastCheckpoint:
        global_step = tf.compat.v1.train.get_global_step()
Example #10
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment
        actor_fc_layers (list[int]): list of fc layer parameters for the actor network
        value_fc_layers (list[int]): list of fc layer parameters for the value network
        encoding_conv_layers (list[int]): list of convolutional layer parameters for the
            encoding network
        encoding_fc_layers (list[int]): list of fc layer parameters for the encoding network
        use_rnns (bool): True if RNNs should be used
        use_icm (bool): True if an intrinsic curiosity module should be used
        learning_rate (float): learning rate
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries)
        debug_summaries (bool): True if debug summaries should be created.
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            env.observation_spec(),
            env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(env.observation_spec(),
                                    input_fc_layer_params=value_fc_layers,
                                    output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(env.observation_spec(),
                                             env.action_spec(),
                                             fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(env.observation_spec(),
                                 fc_layer_params=value_fc_layers)

    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=env.observation_spec(),
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)

    icm = None
    if use_icm:
        feature_spec = env.observation_spec()
        if encoding_net:
            feature_spec = tf.TensorSpec((encoding_fc_layers[-1], ),
                                         dtype=tf.float32)
        icm = ICMAlgorithm(env.action_spec(),
                           feature_spec,
                           encoding_net=encoding_net)

    algorithm = algorithm_class(action_spec=env.action_spec(),
                                actor_network=actor_net,
                                value_network=value_net,
                                intrinsic_curiosity_module=icm,
                                loss_class=loss_class,
                                optimizer=optimizer,
                                debug_summaries=debug_summaries)

    return algorithm
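
A minimal usage sketch for `create_ac_algorithm`, assuming a gym environment wrapped in the same way as Examples #1 and #5 (the `suite_gym` loader, environment name, and layer sizes are assumptions):

# Hypothetical usage: a plain actor-critic algorithm for CartPole.
env = TFPyEnvironment(suite_gym.load("CartPole-v0"))
algorithm = create_ac_algorithm(env,
                                actor_fc_layers=(100, ),
                                value_fc_layers=(100, ),
                                learning_rate=1e-3)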
Example #11
def train_eval(
    # harness
    # tensorboard files
    root_dir,
    # Params for collect
    num_environment_steps,
    # Params for eval
    num_eval_episodes,
    eval_interval,
    # Params for summaries
    summary_interval,
    # environment
    env_name,
    gym_random_seed,
    # agent
    random_seed,
    num_hidden_layers_agent,
    num_hidden_nodes_agent,
    discount_factor,
    lambda_value,
    max_kl,
    backtrack_coefficient,
    backtrack_iters,
    cg_iters,
    reward_normalizer,
    reward_norm_clipping,
    log_prob_clipping,
    value_train_iters,
    value_optimizer,
    gradient_clipping,
    debug,
    # agent trainer
    steps_per_policy_update,
    # agent specific harness parameters
    replay_buffer_capacity,
    use_tf_function,
):
    """
    This function will train and evaluate a TRPO agent.

    :param root_dir: Root directory where all experiments are stored.
    :param num_environment_steps: The number of environment steps to run the
            experiment for.
    :param num_eval_episodes: Number of episodes at each evaluation point.
    :param eval_interval: Interval for evaluation points.
    :param summary_interval: Interval for summaries.
    :param env_name: Name for the environment to load.
    :param gym_random_seed: Value to use as seed for the environment.
    :param random_seed: Seed value used for the TensorFlow random number generator.
    :param num_hidden_layers_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of hidden layers in each network.
    :param num_hidden_nodes_agent: A model-free agent parameter, used for constructing neural
            networks for the actor and critic. The number of nodes in each hidden layer. The
            parameter is shared across all layers.
    :param discount_factor: discount factor in [0, 1]
    :param lambda_value: trace decay used by the GAE critic in [0, 1]
    :param max_kl: maximum KL distance between updated and old policy
    :param backtrack_coefficient: coefficient used in step size search
    :param backtrack_iters: number of iterations to perform in the line search
    :param cg_iters: number of conjugate gradient iterations to approximate natural gradient
    :param reward_normalizer: TensorNormalizer applied to rewards
    :param reward_norm_clipping: value used to clip rewards
    :param log_prob_clipping: clip value for log probs in the policy gradient; None for no clipping
    :param value_train_iters: number of gradient steps to perform on value estimator
            for every policy update
    :param value_optimizer: optimizer used to train value_function (default: Adam)
    :param gradient_clipping: norm value used to clip gradients (None for no clipping)
    :param debug: debug flag to check computations for NaNs
    :param steps_per_policy_update: steps between policy updates
    :param replay_buffer_capacity: Capacity of the buffer collecting real samples.
    :param use_tf_function: If `True`, use a `tf.function` for data collection.
    """
    tf.compat.v1.set_random_seed(random_seed)

    environment = create_real_tf_environment(env_name, gym_random_seed)
    evaluation_environment = create_real_tf_environment(
        env_name, gym_random_seed)

    network_architecture = (num_hidden_nodes_agent, ) * num_hidden_layers_agent
    actor_net = ActorDistributionNetwork(
        environment.observation_spec(),
        environment.action_spec(),
        fc_layer_params=network_architecture,
    )
    value_net = ValueNetwork(environment.observation_spec(),
                             fc_layer_params=network_architecture)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    agent = TRPOAgent(
        environment.time_step_spec(),
        environment.action_spec(),
        actor_net,
        value_net,
        discount_factor,
        lambda_value,
        max_kl,
        backtrack_coefficient,
        backtrack_iters,
        cg_iters,
        reward_normalizer,
        reward_norm_clipping,
        log_prob_clipping,
        value_train_iters,
        value_optimizer,
        gradient_clipping,
        debug,
        train_step_counter=global_step,
    )

    agent_trainer = OnPolicyModelFreeAgentTrainer(steps_per_policy_update)

    experiment_harness = ExperimentHarness(
        root_dir,
        environment,
        evaluation_environment,
        agent,
        agent_trainer,
        replay_buffer_capacity,
        num_environment_steps,
        summary_interval,
        eval_interval,
        num_eval_episodes,
        number_of_initial_random_policy_steps=0,
        use_tf_function=use_tf_function,
    )
    experiment_harness.run()
Example #12
    def __init__(self,
                 observation_spec,
                 action_spec,
                 memory: MemoryWithUsage,
                 num_read_keys=1,
                 lstm_size=(256, 256),
                 latent_dim=200,
                 loss=None,
                 loss_weight=1.0,
                 name="mba"):
        """Create the policy module of MERLIN.

        Args:
            observation_spec (nested TensorSpec): representing the observations.
            action_spec (nested BoundedTensorSpec): representing the actions.
            memory (MemoryWithUsage): the memory module from MemoryBasedPredictor.
            num_read_keys (int): number of keys for reading memory.
            lstm_size (list[int]): sizes of the lstm layers.
            latent_dim (int): the dimension of the hidden representation of the VAE.
            loss (None|ActorCriticLoss): an object for calculating the loss
                for reinforcement learning. If None, a default ActorCriticLoss
                will be used.
            loss_weight (float): weight applied to the loss.
            name (str): name of the algorithm.
        """
        # This is different from the Merlin LSTM. This rnn only uses the output
        # from the last LSTM layer, while Merlin uses the outputs from all LSTM
        # layers.
        rnn = make_lstm_cell(lstm_size, name=name + "/lstm")

        actor_input_dim = (latent_dim + lstm_size[-1] +
                           num_read_keys * memory.dim)

        actor_net = ActorDistributionNetwork(
            input_tensor_spec=TensorSpec((actor_input_dim, ),
                                         dtype=tf.float32),
            output_tensor_spec=action_spec,
            fc_layer_params=(200, ),
            activation_fn=tf.keras.activations.tanh,
            name=name + "/actor_net")

        super(MemoryBasedActor, self).__init__(
            observation_spec=observation_spec,
            action_spec=action_spec,
            train_state_spec=get_rnn_cell_state_spec(rnn),
            name=name)

        self._loss = ActorCriticLoss(action_spec) if loss is None else loss
        self._loss_weight = loss_weight
        self._memory = memory

        self._key_net = tf.keras.layers.Dense(num_read_keys *
                                              (self._memory.dim + 1),
                                              name=name + "/key_net")

        # TODO: add log p(a_i) as input to value net
        value_input_dim = latent_dim
        self._value_net = ValueNetwork(
            input_tensor_spec=TensorSpec((value_input_dim, ), dtype=tf.float32),
            fc_layer_params=(200, ),
            activation_fn=tf.keras.activations.tanh,
            name=name + "/value_net")

        self._rnn = rnn
        self._actor_net = actor_net
Example #13
def make_networks(env,
                  size=(96, 96),
                  num_frames=1,
                  num_channels=3,
                  conv_params=[(16, 8, 4), (32, 3, 2)],
                  in_fc_params=(256, ),
                  out_fc_params=(128, ),
                  use_lstm=False,
                  lstm_size=(256, )):
    """ Creates the actor and critic neural networks of the PPO agent.

    Function for creating the neural networks for the PPO agent, namely the
    actor and value networks.

    Source for network params:
    https://www.arconsis.com/unternehmen/blog/reinforcement-learning-doom-with-tf-agents-and-ppo

    Arguments:
        env (TfPyEnvironment): A TensorFlow environment the agent interacts with.
        size (tuple):  The desired width and height of the observation space.
            Defaults to (96, 96).  Input tuple should preserve the original
            observation aspect ratio.
        num_frames (int):  Number of frames used in the agent's observation.
            Defaults to 1; num_frames > 1 indicates frame stacking.
        num_channels (int):  Number of color channels to include for each frame.
            Defaults to 3 (RGB), and 1 denotes grayscale.
        conv_params (list): A list corresponding to convolutional layer
            parameters for the PPO agent's actor and critic neural networks.
        in_fc_params (tuple): The number of neurons in the input fully
            connected layer of the actor and critic networks of the agent.
        out_fc_params (tuple): The number of neurons in the output fully
            connected layer of the actor and critic networks of the agent.
        use_lstm (bool):  Whether to use LSTM-based actor and critic networks.
        lstm_size (tuple): The number of hidden states inside the LSTM for the
            actor and critic networks of the agent.

    Returns:
        actor_net (ActorDistributionNetwork): A tf-agents Actor Distribution
            Network used for PPO agent action selection.
        value_net (ValueNetwork): A tf-agents Value Network used for
            PPO agent value estimation.
    """
    # Restructure time step spec to match expected processed observations
    processed_shape = tuple(size + (num_channels * num_frames, ))
    obs_spec = env.observation_spec()  # Get old observation spec
    obs_spec = tensor_spec.BoundedTensorSpec(processed_shape,
                                             obs_spec.dtype,
                                             minimum=obs_spec.minimum,
                                             maximum=obs_spec.maximum,
                                             name=obs_spec.name)
    if use_lstm:  # LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionRnnNetwork(
            obs_spec,
            env.action_spec(),
            conv_layer_params=conv_params,
            input_fc_layer_params=in_fc_params,
            lstm_size=lstm_size,
            output_fc_layer_params=out_fc_params)
        # Define value network
        value_net = ValueRnnNetwork(obs_spec,
                                    conv_layer_params=conv_params,
                                    input_fc_layer_params=in_fc_params,
                                    lstm_size=lstm_size,
                                    output_fc_layer_params=out_fc_params)

        print("Created Actor and Value Networks with LSTM...")

    else:  # non-LSTM-based policies
        # Define actor network
        actor_net = ActorDistributionNetwork(obs_spec,
                                             env.action_spec(),
                                             conv_layer_params=conv_params)
        # Define value network
        value_net = ValueNetwork(obs_spec, conv_layer_params=conv_params)

    return actor_net, value_net
    eval_env = TFPyEnvironment(eval_py_env)
    train_env = TFPyEnvironment(
        ParallelPyEnvironment([get_env] * 4, start_serially=False))

    # Create a global step
    global_step = tf.compat.v1.train.get_or_create_global_step()

    # Create the actor network (with the normal distribution)
    actor_net = ActorDistributionNetwork(
        input_tensor_spec=train_env.observation_spec(),
        output_tensor_spec=train_env.action_spec(),
        fc_layer_params=(128, 256, 512, 512, 256),
        continuous_projection_net=normal_net)

    # Create the value network
    value_net = ValueNetwork(input_tensor_spec=train_env.observation_spec(),
                             fc_layer_params=(256, 512, 512))

    # Create the PPO agent
    ppo_agent = PPOClipAgent(time_step_spec=train_env.time_step_spec(),
                             action_spec=train_env.action_spec(),
                             optimizer=Adam(learning_rate=5e-4),
                             actor_net=actor_net,
                             value_net=value_net,
                             importance_ratio_clipping=0.2,
                             discount_factor=0.95,
                             entropy_regularization=0.0,
                             num_epochs=16,
                             use_gae=True,
                             use_td_lambda_return=True,
                             log_prob_clipping=3,
                             gradient_clipping=0.5,