def simple_actor_critic(hidden_sizes=(32, 32),
                        activation='relu',
                        activation_output=None,
                        kernel_initalizer='glorot_uniform',
                        name='simple_actor_critic',
                        env_info=EnvInfo):

    _actor = mlp(hidden_sizes=hidden_sizes,
                 output_size=env_info.act_size,
                 activation=activation,
                 activation_output=activation_output,
                 name=name,
                 kernel_initalizer=kernel_initalizer)

    _critic = mlp(hidden_sizes=hidden_sizes,
                  output_size=1,
                  activation=activation,
                  activation_output=activation_output,
                  name=name,
                  kernel_initalizer=kernel_initalizer)

    log('Model Summary: ' + name)

    _actor.build(input_shape=(None, ) + env_info.shapes['vec'])
    _actor.summary()

    _critic.build(input_shape=(None, ) + env_info.shapes['vec'])
    _critic.summary()

    def forward(inp=None):
        logits = _actor(inp['vec_obs'])
        values = _critic(inp['vec_obs'])
        return logits, values

    return {"forward": forward, "trainable_networks": [_actor, _critic]}
def vis_vec_actor_critic(hidden_sizes=(32, 32),
                         activation='relu',
                         activation_output=None,
                         kernel_initalizer='glorot_uniform',
                         name='vis_vec_actor_critic',
                         env_info=EnvInfo):

    cnn, out_units = cnn_simple()

    _mlp_actor = mlp(hidden_sizes=hidden_sizes,
                     output_size=env_info.act_size,
                     activation=activation,
                     activation_output=activation_output,
                     name=name,
                     kernel_initalizer=kernel_initalizer)

    _mlp_critic = mlp(hidden_sizes=hidden_sizes,
                      output_size=1,
                      activation=activation,
                      activation_output=activation_output,
                      name=name,
                      kernel_initalizer=kernel_initalizer)

    log('Model Summary: ' + name)

    cnn.build(input_shape=(None, ) + env_info.shapes['vis'])
    cnn.summary()
    _mlp_actor.build(input_shape=(None, env_info.shapes['vec'][0] + out_units))
    _mlp_actor.summary()
    _mlp_critic.build(input_shape=(None,
                                   env_info.shapes['vec'][0] + out_units))
    _mlp_critic.summary()

    def forward(inp=None):
        out_cnn = cnn(inp['vis_obs'])  # Convolutional Network for visuals
        # out_vec_mlp = _mlp_vec(inp['vec_obs'])                    # pass vec_obs through an MLP first if it has too many features
        mixed = tf.concat([out_cnn, inp['vec_obs']],
                          -1)  # Concatenate cnn and vec_obs or out_vec_mlp
        # out_mixer_mlp = _mlp_mixer(mixed)                         # state mixer with Neural Network if needed
        logits = _mlp_actor(mixed)  # Feed with raw mixed or with out_mixer_mlp
        values = _mlp_critic(mixed)
        return logits, values

    return {
        "forward": forward,
        "trainable_networks": [cnn, _mlp_actor, _mlp_critic]
    }
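
# A hedged sketch of the optional variant the comments above refer to (`_mlp_vec` is
# hypothetical and not part of this snippet; it would compress a large vec_obs before
# concatenation, reusing the same `mlp` helper already used here):
#
#   _mlp_vec = mlp(hidden_sizes=(64, ), output_size=32, activation=activation,
#                  activation_output=None, name=name + '_vec',
#                  kernel_initalizer=kernel_initalizer)
#   out_vec = _mlp_vec(inp['vec_obs'])           # (batch, 32) instead of the raw vec_obs
#   mixed = tf.concat([out_cnn, out_vec], -1)    # actor/critic input becomes out_units + 32
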
def cnn_simple_actor_critic(hidden_sizes=(32, 32),
                            activation='relu',
                            activation_output=None,
                            kernel_initalizer='glorot_uniform',
                            name='cnn_simple_actor_critic',
                            env_info=EnvInfo):

    cnn, _ = cnn_simple()

    _actor = tf.keras.Sequential(name='actor')
    _critic = tf.keras.Sequential(name='critic')
    _actor.add(cnn)
    _critic.add(cnn)

    _mlp_actor = mlp(hidden_sizes=hidden_sizes,
                     output_size=env_info.act_size,
                     activation=activation,
                     activation_output=activation_output,
                     name=name,
                     kernel_initalizer=kernel_initalizer)

    _actor.add(_mlp_actor)

    _mlp_critic = mlp(hidden_sizes=hidden_sizes,
                      output_size=1,
                      activation=activation,
                      activation_output=activation_output,
                      name=name,
                      kernel_initalizer=kernel_initalizer)

    _critic.add(_mlp_critic)

    log('Model Summary: ' + name)
    _actor.build(input_shape=(None, ) + env_info.shapes['vis'])
    _actor.summary()
    _critic.build(input_shape=(None, ) + env_info.shapes['vis'])
    _critic.summary()

    def forward(inp=None):
        logits = _actor(inp['vis_obs'])
        values = _critic(inp['vis_obs'])
        return logits, values

    return {"forward": forward, "trainable_networks": [_actor, _critic]}
    def update(self, rollouts):
        """
            Update Policy and the Value Network
            -----------------------------------
                Inputs: obs, act, advantages, returns, logp-t
                Returns: loss-pi, loss-entropy, approx-ent, kl, loss-v, loss-total
        """
        inds = np.arange(self.nbatch)

        for i in range(self.train_iters):

            losses = self._inner_update_loop(rollouts['vec_obses'],
                                             rollouts['vis_obses'],
                                             rollouts['actions'],
                                             rollouts['advs'],
                                             rollouts['returns'],
                                             rollouts['logp'], inds)

            if losses['approx_kl'] > 1.5 * self.target_kl:
                log("Early stopping at step %d due to reaching max kl." % i)
                break

        return losses  # Return Metrics
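
# A hedged usage sketch (the key names are taken from the call above; the shapes are
# illustrative, and `vis_obses` may go unused depending on the network architecture):
#
#   rollouts = {
#       'vec_obses': vec_obs_batch,    # (nbatch, vec_dim)
#       'vis_obses': vis_obs_batch,    # (nbatch, H, W, C)
#       'actions':   actions,          # (nbatch, )
#       'advs':      advantages,       # (nbatch, ), typically normalised
#       'returns':   returns,          # (nbatch, )
#       'logp':      logp_old,         # (nbatch, ) log-probs at collection time
#   }
#   metrics = ppo.update(rollouts)     # e.g. metrics['pi_loss'], metrics['approx_kl']
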
tf.random.set_seed(params.env.seed)                                     # Set Random Seeds for np and tf
np.random.seed(params.env.seed)

env = BitcoinTradingEnv(data)                                           # Create the trading environment
LOGGER = Logger('academy_name', os.getcwd(), config)                    # Set up the logger

# network = network_builder(params.trainer.nn_architecure) \
#     (hidden_sizes=params.policy.hidden_sizes, env_info=env.env_info)    # Build Neural Network with Forward Pass

network = simple_actor_critic(env_info=env_test, hidden_sizes=(128, 128))  # Our own simple_actor_critic variant that fits this environment

model = CategoricalModel_2(network=network, env_info=env_test)          # Build model for the discrete (categorical) action space

if params.trainer.load_model:
    log('Loading Model ...')
    model.load_weights(LOGGER.tf_model_path('model_weights'))           # Load model if load_model flag set to true

roller = Roller(env_test, model, params.trainer.steps_per_epoch,
                params.trainer.gamma, params.trainer.lam)               # Define Roller for generating rollouts for training

ppo = PolicyCombinedLoss(model=model, num_envs=1)            # Define PPO Policy with combined loss

for epoch in range(params.trainer.epochs):                              # Main training loop for n epochs

    rollouts, infos = roller.rollout()                                  # Get Rollout and infos
    outs = ppo.update(rollouts)                                         # Push rollout in ppo and update policy accordingly

    LOGGER.store('Loss Pi', outs['pi_loss'])
    LOGGER.store('Loss V', outs['v_loss'])
    LOGGER.store('Loss Ent', outs['entropy_loss'])
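
# A hedged sketch of a typical epoch tail (everything below is an assumption: the
# `save_model`/`save_freq` parameters and `save_weights` are not shown in this snippet):
#
#   if params.trainer.save_model and epoch % params.trainer.save_freq == 0:
#       model.save_weights(LOGGER.tf_model_path('model_weights'))      # periodic checkpoint
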