Example #1
# Assumes the TF2 branch of OpenAI Baselines, where ppo2's Model takes
# the action space and a policy network directly.
import time

import tensorflow as tf
from baselines.ppo2.model import Model

model = Model(ac_space=ac_space,          # env.action_space of a pre-built env
              policy_network=network,     # e.g. built via baselines.common.models
              ent_coef=0.0,
              vf_coef=0.5,
              max_grad_norm=0.5)

# Restore the most recent checkpoint saved during training.
ckpt = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(ckpt, '../models/PPO22', max_to_keep=None)
ckpt.restore(manager.latest_checkpoint)

obs = env.reset()

state = model.initial_state  # None for feed-forward policies, RNN state otherwise

episode_reward = 0
done = False
while True:
    if state is not None:
        # Recurrent policy: feed the previous RNN state and done mask back in
        # (same S=/M= keywords as in Example #2 below; mask shape assumes one env).
        actions, _, state, _ = model.step(obs, S=state, M=[done])
    else:
        actions, _, _, _ = model.step(obs)

    obs, rew, done, _ = env.step(actions.numpy())
    episode_reward += rew
    env.render()

    time.sleep(1 / 24)  # throttle rendering to roughly 24 FPS
    if done:
        print(f'episode_reward={episode_reward}')
        episode_reward = 0
        obs = env.reset()  # start the next episode
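
The snippet above assumes ac_space, network, and env already exist. A minimal sketch of how they might be built (illustrative only; it assumes the TF2 branch of Baselines, where network builders return a function of the observation shape, and CartPole-v1 stands in for the real environment):

import gym
from baselines.common.models import mlp

env = gym.make('CartPole-v1')
ac_space = env.action_space
# mlp(...) returns a builder; calling it on the observation shape yields the network.
network = mlp(num_layers=2, num_hidden=64)(env.observation_space.shape)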
Example #2
# try:
# action = model(ob)
# print(obs)
# assert len(obs) == 1, len(obs)
# adopted_obs = FootballEnv.convert_observations_static(
#     original=obs[0], player=player,
#     left_player_position=0,
#     right_player_position=0,
#     config=config,
# )
# a = self._action_to_list(player.take_action(adopted_obs))
# for k, v in sorted(obs[0].items()): print(k, v)
print(obs.shape)
# actions = player.take_action(observations=obs)
# Query the trained policy for one action, feeding back the recurrent state.
actions, values, states, neglogpacs = model.step(obs,
                                                 S=states,
                                                 M=dones)
# action, _states = model.predict(obs)
# assert 0, action
assert len(actions) == 1, actions
# while int(actions[0]._backend_action) >= NUM_ACTIONS:
#     print(actions)
#     actions = player.take_action(observations=obs)
#     assert len(actions) == 1, actions
# print(actions)
# ACTION_TO_INDEX_MAP = {a:i for i, a in enumerate(DEFAULT_ACTION_SET)}
# actions = [ACTION_TO_INDEX_MAP[a] for a in actions]
obs, rew, done, _ = env.step(actions)
dones = [done]  # keep the recurrent mask in sync for the next model.step call
logging.info(
    'Playing the game, step %d, action %s, rew %s, done %d', cnt,
    actions, rew, done)
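
This fragment runs inside a stepping loop; a minimal sketch of the scaffolding it assumes (variable names match the fragment, everything else is illustrative):

import logging

obs = env.reset()
states = model.initial_state  # None for feed-forward policies
dones = [False]
cnt = 0
while True:
    # ... fragment above goes here ...
    cnt += 1
    if done:
        break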