import numpy as np
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
results = {}
N = 100
# config, envs, and agents are assumed to be defined earlier in the script.
config["num_workers"] = 1
config["num_gpus"] = 0
# You may have to run each agent in a separate session
# to avoid PyBullet restrictions.
agent = "ALP-GMM"
# agent = "Manual"
# agent = "No Curriculum"
print(f"Evaluating agent: {agent}")
results[agent] = []
trainer = PPOTrainer(config=config, env=envs[agent])
trainer.restore(agents[agent])
# Create an evaluation copy of the environment with the
# training-time curriculum logic switched off.
env = envs[agent](dict(config["env_config"], **{"in_training": False}))
for i in range(N):
    print(agent, i)
    done = False
    obs = env.reset()
    ep_reward = 0
    while not done:
        action = trainer.compute_action(obs)
        obs, reward, done, info = env.step(action)
        ep_reward += reward
        if done:
            obs = env.reset()
            results[agent].append(ep_reward)
print(f"Agent {agent} score: {np.round(np.mean(results[agent]), 2)}")
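The script above assumes that `envs` maps each curriculum label to its environment class and that `agents` maps the same labels to the checkpoint paths saved during training. A minimal sketch of what those lookup tables could look like; the class names and paths below are placeholders, not the originals:

# Hypothetical lookup tables assumed by the evaluation script above.
# The environment class names and checkpoint paths are placeholders.
envs = {
    "ALP-GMM": KukaALPGMMEnv,
    "Manual": KukaManualCurriculumEnv,
    "No Curriculum": KukaNoCurriculumEnv,
}
agents = {
    "ALP-GMM": "path/to/alp_gmm/checkpoint-500",
    "Manual": "path/to/manual/checkpoint-500",
    "No Curriculum": "path/to/no_curriculum/checkpoint-500",
}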
trainer.restore(
    # Replace this with your checkpoint path.
    "/home/enes/ray_results/PPO_InventoryEnv_2020-10-06_04-58-04t8r36o9o/checkpoint_781/checkpoint-781"
)

if __name__ == "__main__":
    np.random.seed(0)
    env = InventoryEnv()
    episode_reward_avgs = []
    episode_total_rewards = []
    for i in range(2000):
        print(f"Episode: {i+1}")
        state = env.reset()
        done = False
        ep_rewards = []
        while not done:
            action = trainer.compute_action(state)
            state, reward, done, info = env.step(action)
            ep_rewards.append(reward)
        total_reward = np.sum(ep_rewards)
        reward_per_day = np.mean(ep_rewards)
        print(f"Total reward: {total_reward}")
        print(f"Reward per time step: {reward_per_day}")
        episode_reward_avgs.append(reward_per_day)
        episode_total_rewards.append(total_reward)
    print(
        f"Average daily reward over {len(episode_reward_avgs)} "
        f"test episodes: {np.mean(episode_reward_avgs)}. "
        f"Average total episode reward: {np.mean(episode_total_rewards)}"
    )
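The `trainer.restore(...)` call presumes a `trainer` object has already been built with a configuration whose policy architecture matches the checkpoint. A minimal sketch of that setup, assuming default model settings; the config used for training may have included more options:

# A minimal, assumed setup preceding the restore call above.
import numpy as np
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
config = {
    "env_config": {},
    "num_workers": 0,  # no rollout workers are needed for evaluation
}
trainer = PPOTrainer(env=InventoryEnv, config=config)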
import random

import numpy as np
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()
# env is assumed to be a TicTacToe instance created earlier.
num_policies = 4
policies = {
    "policy_{}".format(i): (None, env.observation_space, env.action_space, {})
    for i in range(num_policies)
}
policy_ids = list(policies.keys())
config = {
    "multiagent": {
        "policies": policies,
        # During self-play training, each agent was assigned a random policy.
        "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
    },
    "framework": "tf",
}
# trainer = ApexTrainer(env=TicTacToe, config=config)
trainer = PPOTrainer(env=TicTacToe, config=config)
trainer.restore("ttt_model/checkpoint_51/checkpoint-51")

obs = env.reset()
print(obs)
done = False
while not done:
    env.render()
    # The key of the observation dict tells us whose turn it is.
    player = list(obs)[0]
    if player == "X":
        action = int(input(f"Player {player} - enter action 1-9: ")) - 1
    else:
        action = trainer.compute_action(np.array(obs["O"]), policy_id="policy_1")
    obs, rewards, dones, infos = env.step({player: action})
    done = dones["__all__"]
    print(obs, rewards, dones, infos)
env.render()
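One fragile spot in the interactive loop is the bare `input()` call: a typo or an out-of-range entry crashes the game with a `ValueError`. A small guard you could wrap around it, shown here as an illustrative helper (the original loop simply trusts the input):

def read_human_action():
    # Keep prompting until the human enters a valid move in 1-9,
    # then convert it to the environment's 0-8 action index.
    while True:
        raw = input("Player X - enter action 1-9: ")
        if raw.isdigit() and 1 <= int(raw) <= 9:
            return int(raw) - 1
        print("Please enter a number between 1 and 9.")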
steps = 0
draw = 0
wins = 0
losses = 0

# Select the number of evaluation episodes.
n_episodes = 100
avg_rew = 0
total_rew = 0

# Play the game with the trained agent.
# env, ppo_trainer, and action_dict (a counter of how often each
# action is picked) are assumed to be defined earlier.
for i_episode in range(1, n_episodes + 1):
    obs = env.reset()
    episode_rew = 0
    while True:
        # Sample an action from the trained policy.
        action = ppo_trainer.compute_action(obs, policy_id="policy_01")
        action_dict[str(int(action))] += 1
        obs, rew, done, info = env.step(action)
        episode_rew += rew
        if done:
            break
    if 0 <= episode_rew <= 1.0:
        draw += 1
    if episode_rew > 1.0:
        wins += 1
    total_rew += episode_rew
    print("Episode:%d Reward:%.2f" % (i_episode, episode_rew))
avg_rew = total_rew / i_episode
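The loop tallies `wins` and `draw` but never increments `losses`. If, as the reward thresholds suggest, every episode that is neither a win nor a draw is a loss, a summary along these lines could be printed after the loop; the loss computation is an assumption, not part of the original:

# Everything that is neither a win nor a draw is counted as a loss
# (an assumption based on the reward thresholds above).
losses = n_episodes - wins - draw
print(f"Episodes: {n_episodes}, Wins: {wins}, Draws: {draw}, Losses: {losses}")
print(f"Average episode reward: {avg_rew:.2f}")
print(f"Action counts: {action_dict}")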