def f():
    config = ffa_competition_env()
    env = Wrapped_Env(**config["env_kwargs"])
    env.observation_space = spaces.Box(0, 20, shape=(11, 11, 18), dtype=np.float32)

    # Add 3 SimpleAgent opponents
    agents = []
    for agent_id in range(3):
        # if agent_id == env.winner_id:
        #     agents.append(TrainingAgent(config["agent"](agent_id, config["game_type"])))
        # else:
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Put the TrainingAgent in the last slot and mark it as the learner
    agent_id += 1
    agents.append(TrainingAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)
    return env
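
# Usage sketch, assuming Wrapped_Env exposes a gym-style step(action) -> (obs, reward,
# done, info) interface for the training agent in the last slot (that interface is an
# assumption, not shown in this excerpt). Runs one random-action episode as a smoke test.
if __name__ == '__main__':
    env = f()
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
    print('episode finished with reward:', reward)
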
def __init__(self, model_file, agent_id=0):
    config = ffa_competition_env()
    super().__init__(config["agent"](agent_id, config["game_type"]))
    self.agent_id = agent_id
    self.env = self.make_env(config)
    self.model = load_model(model_file)
    self.reset_tree()
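
# A minimal sketch of the kind of network load_model() above could be restoring
# (assumptions: a Keras model, the (11, 11, 18) featurized observation used in the
# env factory above, and Pommerman's 6 discrete actions; the layer sizes are
# illustrative only, not the repository's actual architecture).
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, Dense

def build_policy_model(input_shape=(11, 11, 18), num_actions=6):
    model = Sequential()
    model.add(Conv2D(32, 3, activation='relu', padding='same', input_shape=input_shape))
    model.add(Conv2D(32, 3, activation='relu', padding='same'))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_actions, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model
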
def main():
    # Print all possible environments in the Pommerman registry
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Wrapped_Env(**config["env_kwargs"])
    # env.seed(0)
    env.observation_space = spaces.Box(0, 20, shape=(11, 11, 18))
    env.num_envs = 1

    # Add 3 SimpleAgent opponents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Add the TrainingAgent as the learner in the last slot
    agent_id += 1
    agents.append(TrainingAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # env = VecFrameStack(make_pommerman_env(env, 8, 0), 2)
    # print(env.reset())

    policy = CnnPolicy
    # Model(policy=policy,
    #       ob_space=env.observation_space,
    #       ac_space=env.action_space,
    #       nbatch_act=1,
    #       nbatch_train=100,
    #       nsteps=1000,
    #       ent_coef=0.01,
    #       vf_coef=0.5,
    #       max_grad_norm=0.5)

    num_timesteps = 10000
    learn(policy=policy, env=env, nsteps=800, nminibatches=4,
          lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
          ent_coef=.01,
          lr=lambda f: f * 2.5e-4,
          cliprange=lambda f: f * 0.1,
          total_timesteps=int(num_timesteps * 1.1))
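
# The lr and cliprange arguments above are schedules, not constants: baselines' ppo2
# evaluates them with the fraction of training remaining (assumed to anneal from 1.0
# toward 0.0), so both decay linearly over training. A quick check of the values they
# produce under that calling convention:
lr_schedule = lambda f: f * 2.5e-4
clip_schedule = lambda f: f * 0.1
for frac in (1.0, 0.5, 0.1):
    print('remaining %.1f -> lr %.2e, clip %.3f' % (frac, lr_schedule(frac), clip_schedule(frac)))
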
def stimulate(self, num_rollouts):
    # NOTE: the method header and the reset/termination scaffolding below are inferred
    # from the stimulate() call further down; only the step/append body, the progress
    # print and the return statement appear in the original excerpt.
    observations, actions, rewards, dones = [], [], [], []
    for i in range(num_rollouts):
        obs = self.env.reset()
        done = False
        episode_steps = 0
        while not done:
            all_actions = self.env.act(obs)
            obs, reward, done, _ = self.env.step(all_actions)
            episode_steps += 1
            observations.append(obs)
            actions.append(all_actions)
            rewards.append(reward)
            dones.append(done)
        print('rollout %i/%i' % (i + 1, num_rollouts))
    return (np.array(observations),
            np.array(to_categorical(actions, self.env.action_space.n)),
            np.array(rewards),
            np.array(dones))


# Instantiate the environment
config = ffa_competition_env()
env = Pomme(**config["env_kwargs"])

# Generate training data
stimulator = Stimulator(env, config)
observations, actions, rewards, dones = stimulator.stimulate(num_rollouts=initial_rollouts)

np.save(train_data_path + train_data_obs, observations)
np.save(train_data_path + train_data_labels, actions)
np.save(train_data_path + train_data_reward, rewards)
np.save(train_data_path + train_data_done, dones)

# print(np.sum(training_data_labels, axis=0) / np.sum(training_data_labels))
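
# Sketch for reading the saved rollout data back (assumptions: the same train_data_path /
# file-name variables above are in scope and include the '.npy' extension that np.save
# would otherwise append; the observation array holds raw Pommerman observation dicts,
# so it needs allow_pickle=True and further featurization before it can feed a network).
loaded_obs = np.load(train_data_path + train_data_obs, allow_pickle=True)
loaded_labels = np.load(train_data_path + train_data_labels)
loaded_rewards = np.load(train_data_path + train_data_reward)
print('steps collected:', len(loaded_obs), '| one-hot action labels:', loaded_labels.shape)
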
def main():
    # Print all possible environments in the Pommerman registry
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    with open('ppo.json', 'r') as fp:
        agent_spec = json.load(fp=fp)
    with open('mlp2_lstm_network.json', 'r') as fp:
        network = json.load(fp=fp)

    agent = Agent.from_spec(
        spec=agent_spec,
        kwargs=dict(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=dict(type='int', num_actions=env.action_space.n),
            network=network
        )
    )

    # Add 3 SimpleAgent opponents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Add the TensorforceAgent as the learner in the last slot
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # Wrap the environment and run it with the Tensorforce runner.
    if VISUALIZE:
        wrapped_env = WrappedEnv(env, True)
    else:
        wrapped_env = WrappedEnv(env)

    runner = Runner(agent=agent, environment=wrapped_env)

    rewards = []
    episodes = []

    def episode_finished(r):
        nonlocal episodes
        nonlocal rewards
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))

        # Every 1000 episodes: checkpoint the model and flush rewards/episodes to disk
        if r.episode % 1000 == 0:
            agent.save_model('./{}'.format(EXPERIMENT_NAME), False)
            try:
                prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
                prev_len = len(prev_data[0])
                prev_data[0].extend(rewards)
                rewards = []
                prev_data[1].extend(episodes)
                episodes = []
                pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
            except (OSError, IOError) as e:
                pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

        if r.episode_rewards[-1] >= 5:
            print()
            print()
            print()
            print("WINNER WINNER CHICKEN DINNER")

        episodes.append(r.episode)
        rewards.append(r.episode_rewards[-1])
        return True

    # Restore, train, and save the model
    if args.test or args.resume:
        # If testing or resuming, restore the saved model first
        agent.restore_model('./', 'PPO_K_someS_500batch_biggerreward_99dis')

    runner.run(episodes=EPISODES, max_episode_timesteps=2000,
               episode_finished=episode_finished, deterministic=False)

    if not args.test:
        agent.save_model('./{}'.format(EXPERIMENT_NAME), False)

    print("Stats: ", runner.episode_rewards[-5:], runner.episode_timesteps[-5:])

    # Dump reward values
    try:
        prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
        prev_data[0].extend(rewards)
        prev_data[1].extend(episodes)
        print(episodes)
        pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
    except (OSError, IOError) as e:
        pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

    try:
        runner.close()
    except AttributeError as e:
        pass
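
# Sketch for inspecting the training curve dumped above (assumptions: EXPERIMENT_NAME
# is in scope and the pickle file holds the [rewards, episodes] pair written by
# episode_finished and the final dump).
with open(EXPERIMENT_NAME, "rb") as fp:
    logged_rewards, logged_episodes = pickle.load(fp)
print("episodes logged:", len(logged_episodes))
if logged_rewards:
    recent = logged_rewards[-100:]
    print("mean reward over last %d logged episodes: %.3f" % (len(recent), sum(recent) / len(recent)))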