def f():
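    # Environment factory: three SimpleAgent opponents plus one TrainingAgent,
    # with an 11x11x18 feature-plane observation space bounded to [0, 20].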
    config = ffa_competition_env()
    env = Wrapped_Env(**config["env_kwargs"])
    env.observation_space = spaces.Box(0, 20, shape=(11, 11, 18), dtype=np.float32)

    # Add three SimpleAgent opponents
    agents = []
    for agent_id in range(3):
        # if agent_id == env.winner_id:
        #     agents.append(TrainingAgent(config["agent"](agent_id, config["game_type"])))
        # else:
        agents.append(
            SimpleAgent(config["agent"](agent_id, config["game_type"])))
    agent_id += 1  # the training agent takes the next id (3)
    agents.append(
        TrainingAgent(config["agent"](agent_id, config["game_type"])))

    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)
    return env
Example #2
def __init__(self, model_file, agent_id=0):
    config = ffa_competition_env()
    super().__init__(config["agent"](agent_id, config["game_type"]))
    self.agent_id = agent_id
    self.env = self.make_env(config)
    self.model = load_model(model_file)
    self.reset_tree()
def main():
    # Print all possible environments in the Pommerman registry
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Wrapped_Env(**config["env_kwargs"])
    # env.seed(0)
    env.observation_space = spaces.Box(0, 20, shape=(11, 11, 18), dtype=np.float32)
    env.num_envs = 1

    # Add three SimpleAgent opponents
    agents = []
    for agent_id in range(3):
        agents.append(
            SimpleAgent(config["agent"](agent_id, config["game_type"])))

    agent_id += 1

    # Add the TrainingAgent (the learner)
    agents.append(TrainingAgent(config["agent"](agent_id,
                                                config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)
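    # With a training agent registered, env.act(obs) returns actions for the other
    # three agents only; the learner's action is expected to be injected by the
    # wrapper (Wrapped_Env) at step time.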

    # env = VecFrameStack(make_pommerman_env(env, 8, 0), 2)

    # print(env.reset())

    policy = CnnPolicy

    # Model(policy=policy,
    #            ob_space=env.observation_space,
    #            ac_space=env.action_space,
    #            nbatch_act=1,
    #            nbatch_train=100,
    #            nsteps=1000,
    #            ent_coef=0.01,
    #            vf_coef=0.5,
    #            max_grad_norm=0.5)
    num_timesteps = 10000

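    # Note: assuming learn is baselines' ppo2.learn, a callable lr/cliprange acts as
    # a schedule; each update it is called with the fraction of training remaining
    # (1.0 down to 0.0), so both values anneal linearly towards zero.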
    learn(policy=policy,
          env=env,
          nsteps=800,
          nminibatches=4,
          lam=0.95,
          gamma=0.99,
          noptepochs=4,
          log_interval=1,
          ent_coef=.01,
          lr=lambda f: f * 2.5e-4,
          cliprange=lambda f: f * 0.1,
          total_timesteps=int(num_timesteps * 1.1))
Example #4
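                # Fragment (presumably the inner rollout loop of Stimulator.stimulate):
                # step every agent and record observations, actions, rewards and dones.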
                all_actions = self.env.act(obs)
                obs, reward, done, _ = self.env.step(all_actions)
                episode_steps += 1

                observations.append(obs)
                actions.append(all_actions)
                rewards.append(reward)
                dones.append(done)

            print('rollout %i/%i' % (i + 1, num_rollouts))
        # One-hot encode the recorded actions before returning the rollout arrays.
        return (np.array(observations),
                np.array(to_categorical(actions, self.env.action_space.n)),
                np.array(rewards),
                np.array(dones))


# Instantiate the environment
config = ffa_competition_env()
env = Pomme(**config["env_kwargs"])

# Generate training data
stimulator = Stimulator(env, config)
observations, actions, rewards, dones = stimulator.stimulate(
    num_rollouts=initial_rollouts)

np.save(train_data_path + train_data_obs, observations)
np.save(train_data_path + train_data_labels, actions)
np.save(train_data_path + train_data_reward, rewards)
np.save(train_data_path + train_data_done, dones)

#print(np.sum(training_data_labels, axis=0) / np.sum(training_data_labels))
Example #5
def main():
    # Print all possible environments in the Pommerman registry
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    with open('ppo.json', 'r') as fp:
        agent = json.load(fp=fp)

    with open('mlp2_lstm_network.json', 'r') as fp:
        network = json.load(fp=fp)

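    # Build the Tensorforce agent from the two JSON specs, wiring in the
    # environment's observation shape and number of discrete actions.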
    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=dict(type='int', num_actions=env.action_space.n),
            network=network
        )
    )

    # Add three SimpleAgent opponents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Add TensorforceAgent
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # Wrap the environment and run training.
    if VISUALIZE:
        wrapped_env = WrappedEnv(env, True)
    else:
        wrapped_env = WrappedEnv(env)

    runner = Runner(agent=agent, environment=wrapped_env)

    rewards = []
    episodes = []
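    # Invoked by the Tensorforce Runner after each episode; returning False would
    # stop training early.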
    def episode_finished(r):
        nonlocal episodes
        nonlocal rewards
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
        if r.episode % 1000 == 0:
            agent.save_model('./{}'.format(EXPERIMENT_NAME), False)
            try:
                prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
                prev_len = len(prev_data[0])
                prev_data[0].extend(rewards)
                rewards = []
                prev_data[1].extend(episodes)
                episodes = []
                pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
            except (OSError, IOError) as e:
                pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))
        if r.episode_rewards[-1] >= 5:
            print()
            print()
            print()
            print("WINNER WINNER CHICKEN DINNER")
        episodes.append(r.episode)
        rewards.append(r.episode_rewards[-1])
        return True

    # Restore, Train, and Save Model
    if args.test or args.resume: # If test, change settings and restore model
        agent.restore_model('./', 'PPO_K_someS_500batch_biggerreward_99dis')
    runner.run(episodes=EPISODES, max_episode_timesteps=2000,
               episode_finished=episode_finished,
               deterministic=DETERMINISTIC)  # honour the args.test setting above

    if not args.test:
        agent.save_model('./{}'.format(EXPERIMENT_NAME), False)
    print("Stats: ", runner.episode_rewards[-5:], runner.episode_timesteps[-5:])

    # Dump reward values
    try:
        prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
        prev_len = len(prev_data[0])
        prev_data[0].extend(rewards)
        prev_data[1].extend(episodes)
        print(episodes)
        pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
    except (OSError, IOError) as e:
        pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

    try:
        runner.close()
    except AttributeError as e:
        pass