Example #1
import ray
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
results = {}
N = 100
config["num_workers"] = 1
config["num_gpus"] = 0

# You may have to run each agent in a separate session
# to avoid PyBullet restrictions
agent = "ALP-GMM"
# agent = "Manual"
# agent = "No Curriculum"

print(f"Evaluating agent: {agent}")
results[agent] = []
trainer = PPOTrainer(config=config, env=envs[agent])
trainer.restore(agents[agent])
env = envs[agent](dict(config["env_config"], **{"in_training": False}))
for i in range(N):
    print(agent, i)
    done = False
    obs = env.reset()
    ep_reward = 0
    while not done:
        action = trainer.compute_action(obs)
        obs, reward, done, info = env.step(action)
        ep_reward += reward
    results[agent].append(ep_reward)
print(f"Agent {agent} score: {np.round(np.mean(results[agent]), 2)}")
Example #2
trainer.restore(
    # Replace this with your checkpoint path.
    "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-58-04t8r36o9o/checkpoint_781/checkpoint-781"
)

if __name__ == "__main__":
    np.random.seed(0)
    env = InventoryEnv()
    episode_reward_avgs = []
    episode_total_rewards = []
    for i in range(2000):
        print(f"Episode: {i+1}")
        state = env.reset()
        done = False
        ep_rewards = []
        while not done:
            action = trainer.compute_action(state)
            state, reward, done, info = env.step(action)
            ep_rewards.append(reward)
        total_reward = np.sum(ep_rewards)
        reward_per_day = np.mean(ep_rewards)
        print(f"Total reward: {total_reward}")
        print(f"Reward per time step: {reward_per_day}")
        episode_reward_avgs.append(reward_per_day)
        episode_total_rewards.append(total_reward)
        print(
            f"Average daily reward over {len(episode_reward_avgs)} "
            f"test episodes: {np.mean(episode_reward_avgs)}. "
            f"Average total epsisode reward: {np.mean(episode_total_rewards)}"
        )
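
Example #2 uses a trainer that must be constructed before the restore call. A minimal sketch of that assumed setup follows; InventoryEnv is the project's own environment class and is not defined in this snippet.

import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG

ray.init()
config = DEFAULT_CONFIG.copy()
config["num_workers"] = 0  # no rollout workers needed for evaluation
trainer = PPOTrainer(config=config, env=InventoryEnv)
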
Example #3
import random

import numpy as np
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
# TicTacToe is the project's multi-agent env class, defined elsewhere.
env = TicTacToe()
num_policies = 4
policies = {
    "policy_{}".format(i): (None, env.observation_space, env.action_space, {})
    for i in range(num_policies)
}
policy_ids = list(policies.keys())
config = {
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": lambda agent_id: random.choice(policy_ids),
    },
    "framework": "tf",
}
# trainer = ApexTrainer(env=TicTacToe, config=config)
trainer = PPOTrainer(env=TicTacToe, config=config)
trainer.restore("ttt_model/checkpoint_51/checkpoint-51")
obs = env.reset()
print(obs)
done = False
while not done:
    env.render()
    # The current player's id is the single key of the observation dict.
    player = list(obs)[0]
    if player == "X":
        action = int(input(f"Player {player} - enter action 1-9: ")) - 1
    else:
        action = trainer.compute_action(np.array(obs["O"]), policy_id="policy_1")
    obs, rewards, dones, infos = env.step({player: action})
    done = dones["__all__"]
    print(obs, rewards, dones, infos)
env.render()
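
The TicTacToe class itself ships with the accompanying project; the loop above only relies on the dict-based MultiAgentEnv contract, with "X" and "O" as agent ids and the current player's id as the single observation key. A rough, self-contained stand-in illustrating that contract (not the actual implementation; illegal moves are simply ignored here):

import numpy as np
from gym.spaces import Box, Discrete
from ray.rllib.env.multi_agent_env import MultiAgentEnv

class TicTacToe(MultiAgentEnv):
    # Rough sketch: flat 9-cell board, +1 for the mover's own marks,
    # -1 for the opponent's, turns alternate between "X" and "O".
    def __init__(self, config=None):
        self.observation_space = Box(-1.0, 1.0, shape=(9,))
        self.action_space = Discrete(9)

    def reset(self):
        self.board = np.zeros(9, dtype=np.float32)
        self.player = "X"
        return {self.player: self.board.copy()}

    def step(self, action_dict):
        player = self.player
        mark = 1.0 if player == "X" else -1.0
        cell = action_dict[player]
        if self.board[cell] == 0:  # illegal moves are ignored in this sketch
            self.board[cell] = mark
        win = self._won(mark)
        done = win or not (self.board == 0).any()
        # Hand the turn over; obs is keyed by whoever acts next.
        self.player = "O" if player == "X" else "X"
        obs = {self.player: self.board.copy()}
        rewards = {player: 1.0 if win else 0.0}
        return obs, rewards, {"__all__": done}, {}

    def _won(self, mark):
        b = self.board.reshape(3, 3)
        lines = list(b) + list(b.T) + [b.diagonal(), np.fliplr(b).diagonal()]
        return any((line == mark).all() for line in lines)

    def render(self):
        print(self.board.reshape(3, 3))
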
Example #4
from collections import defaultdict

steps = 0
draw = 0
wins = 0
losses = 0
# Select no. of episodes
n_episodes = 100
avg_rew = 0
total_rew = 0
action_dict = defaultdict(int)  # counts how often each action is taken

# Play game with trained agent
for i_episode in range(1, n_episodes + 1):
    obs = env.reset()
    episode_rew = 0
    while True:
        # Sample action from trained policy
        action = ppo_trainer.compute_action(obs, policy_id="policy_01")
        action_dict[str(int(action))] += 1
        obs, rew, done, info = env.step(action)
        episode_rew += rew

        if done:
            break

    # Episode rewards above 1.0 count as wins, 0..1.0 as draws,
    # and negative rewards as losses.
    if episode_rew > 1.0:
        wins += 1
    elif episode_rew >= 0:
        draw += 1
    else:
        losses += 1
    total_rew += episode_rew

    print("Episode:%d Reward:%.2f" % (i_episode, episode_rew))
avg_rew = total_rew / n_episodes
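
The counters above (and action_dict's per-action tallies) are never reported in the snippet. A possible summary printout to append, hypothetical rather than from the original:

print("Wins: %d  Draws: %d  Losses: %d" % (wins, draw, losses))
print("Average reward over %d episodes: %.2f" % (n_episodes, avg_rew))
print("Action counts:", dict(action_dict))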