Example #1
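Note: every snippet on this page is an excerpt, so the usual playground imports (plus numpy/torch/tensorflow where a snippet uses them) are assumed to already be in place, roughly:

import pommerman
from pommerman import agents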
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    # print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.PlayerAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),  # added so the list holds exactly four agents, as the FFA env requires
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
Example #2
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        #agents.DockerAgent("multiagentlearning/hakozakijunctions", port=12345),
        #agents.DockerAgent("multiagentlearning/eisenach", port=12345),
        agents.DockerAgent("multiagentlearning/skynet955", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeTeamCompetition-v1', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
        print("Final Result: ", info)
    env.close()
Example #3
def main():
    '''Simple function to bootstrap a game.  '''

    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        MyAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
    ]

    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            # This renders the game
            env.render()

            # This is where we give an action to the environment
            actions = env.act(state)

            # This performs the step and gives back the new information
            state, reward, done, info = env.step(actions)

        print('Episode: {:2d} finished'.format(i_episode))
    env.close()
Example #4
    def run_game(self, env_name):
        # Create a set of agents (exactly four)
        agent_list = [
            agents.SimpleAgent(),
            agents.RandomAgent(),
            agents.SimpleAgent(),
            agents.RandomAgent(),
            # agents.DockerAgent("pommerman/simple-agent", port=12345),
        ]

        # Limit the agents for one vs one
        if 'oneVsOne' in env_name:
            agent_list = agent_list[:2]
        env = pommerman.make(env_name, agent_list)

        # Run the episodes just like OpenAI Gym
        for i_episode in range(1):
            state = env.reset()
            done = False
            while not done:
                # env.render()
                actions = env.act(state)
                state, reward, done, info = env.step(actions)
            print('Episode {} finished'.format(i_episode))
        env.close()
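A hypothetical driver for the run_game method above, assuming pommerman.REGISTRY yields environment ids and `runner` is an instance of the surrounding class:

# Exercise every registered environment once (sketch; `runner` is assumed).
for env_name in pommerman.REGISTRY:
    runner.run_game(env_name)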
Example #5
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        env.render()
        img = env._viewer.get_buffer().get_texture().get_image_data()
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            # data = img.get_data("RGB", img.width * 3)
            # arr = np.frombuffer(data, dtype=np.uint8)
            # reshaped_array = arr.reshape(img.width, img.height, 3)

        print('Episode {} finished'.format(i_episode))
    env.close()
Example #6
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)

    shape = (14, 11, 11)
    n_actions = 6
    n_filters_per_layer = 64
    n_cnn_layers = 4
    nn_model = CNNBatchNorm(input_feature_shape=shape,
                            n_actions=n_actions,
                            n_filters_per_layer=n_filters_per_layer,
                            n_cnn_layers=n_cnn_layers)
    nn_path = './output/NN_MODELS/ppo_CNN4_64_199.pt'  #CHANGE THIS to your actual checkpoint file. Currently assumes you are calling the script from main dir
    nn_model.load_state_dict(
        torch.load(nn_path, map_location=lambda storage, loc: storage))
    selection = 'softmax'
    nn_agent = NNAgent(nn_model, action_selection=selection, is_training=False)
    nn_agent2 = NNAgent(nn_model,
                        action_selection=selection,
                        is_training=False)

    idx = 0
    team_id = (idx + 2) % 4
    #env_id="PommeFFACompetition-v0"
    #env_id="PommeTeamCompetition-v0"
    env_id = "SimpleTeam-v0"
    agent_list = [
        agents.RandomAgent(),
        agents.SlowRandomAgentNoBomb(),
        agents.RandomAgent(),
        agents.SlowRandomAgentNoBomb(),
        #agents.PlayerAgent(),
        #agents.RandomAgent(),
    ]
    agent_list[idx] = nn_agent
    agent_list[team_id] = nn_agent2
    # Make the environment using the agent list
    env = pommerman.make(env_id, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            #a=nn_agent.act(state[idx], env.action_space, 'softmax') if nn_agent.is_alive else 0
            #actions[idx]=a
            #print('actions', actions, 'nn alive', nn_agent.is_alive)
            state, reward, done, info = env.step(actions)
            #if nn_agent.is_alive ==False: print('dead')
        print('Episode {} finished'.format(i_episode))
        print("Final Result: ", info)
    env.close()
Example #7
def main():
    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFA-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
Example #8
def main():
    opponents = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
    ]

    _train(opponents, train_from_scratch=True)
    _test(opponents, 100, render=False)
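The _train and _test helpers above are not shown; a plausible shape for _test, reusing the Gym-style loop from the other examples (the trained agent, env id and win check are assumptions, not from the source):

def _test(opponents, n_episodes, render=False, trained_agent=None):
    # Pit the trained agent (slot 0) against the given opponents and count its wins.
    agent_list = [trained_agent or agents.SimpleAgent()] + list(opponents)
    env = pommerman.make('PommeFFACompetition-v0', agent_list)
    wins = 0
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        info = {}
        while not done:
            if render:
                env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        if 0 in info.get('winners', []):
            wins += 1
    env.close()
    print('won {} of {} episodes'.format(wins, n_episodes))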
Example #9
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    lstm_nn_model = CNN_LSTM(input_feature_shape=(9, 11, 11),
                             n_actions=6,
                             n_filters_per_layer=64,
                             n_cnn_layers=6)
    lstm_nn_model.load_state_dict(
        torch.load(
            '/home/cgao3/pommerman/pommerman/agents/LOGS/ppo_cnn_lstm_cnn_6_64_27.pt',
            map_location=lambda storage, loc: storage))
    # torch.load('my_file.pt', map_location=lambda storage, loc: storage) #for map CUDA pt to CPU

    nn_agent = NNAgent(lstm_nn_model)
    nn_agent2 = NNAgent(lstm_nn_model)
    idx = 0
    team_id = (idx + 2) % 4
    #env_id="PommeFFACompetition-v0"
    env_id = "PommeTeamCompetition-v0"
    agent_list = [
        #nn_agent,
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        #agents.SimpleAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    agent_list[idx] = nn_agent
    agent_list[team_id] = nn_agent2
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make(env_id, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(20):
        state = env.reset()
        done = False
        while not done:
            #env.render()
            actions = env.act(state)
            #a=nn_agent.act(state[idx], env.action_space, 'softmax') if nn_agent.is_alive else 0
            #actions[idx]=a
            print('actions', actions, 'nn alive', nn_agent.is_alive)
            state, reward, done, info = env.step(actions)
            #if nn_agent.is_alive ==False: print('dead')
        print('Episode {} finished'.format(i_episode))
    env.close()
Example #10
    def __init__(self, config=None):
        #self._observation_spec = ['board', 'bomb_blast_strength', 'bomb_life', 'position', 'ammo', 'blast_strength', 'can_kick', 'teammate', 'enemies', 'message']

        self.adversarial = config['adversarial']

        # Create a set of agents (exactly four)
        agent_list = [
            agents.RandomAgent(),
            agents.RandomAgent(),
            agents.RandomAgent(),
            agents.RandomAgent(),
        ]

        # Make the "Free-For-All" environment using the agent list
        self._env = pommerman.make('PommeRadioCompetition-v2', agent_list)
Example #11
def main():
    """Simple function to bootstrap a game"""
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.HttpAgent(port=10080, host="localhost"),
    ]
    # Make the "Free-For-All" environment using the agent list
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
        print('Episode {} finished'.format(i_episode))
    env.close()
Example #12
 def __init__(self,
              env_id,
              random_side=True,
              agent_list=None,
              rule_agents=[],
              replay_dir=None,
              n_player=4):
     self.n_player = n_player
     self.base_agents = [agents.RandomAgent() for _ in range(n_player)]
     if agent_list is None:
         self.agent_list = self.base_agents
     else:
         assert isinstance(agent_list, str)
         agent_list = agent_list.split(',')
         assert len(agent_list) == n_player
         self.agent_list = [
             helpers.make_agent_from_string(agent, i)
             for i, agent in enumerate(agent_list)
         ]
     # Make the environment using the agent list
     env = pommerman.make(env_id, self.agent_list)
     if agent_list is not None:
         for id_, agent in enumerate(self.base_agents):
             agent.init_agent(id_, env.spec._kwargs['game_type'])
     super(PommeBase, self).__init__(env)
     self.rule_agents = rule_agents
     self._random_side = random_side
     self.random_side()
     self._uuid = str(uuid.uuid1())[:8]
     self._replay_dir = replay_dir
     self._replay_data = {"mode": str(env_id)}
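The agent_list string parsed above follows the name::class format that pommerman's helpers accept (the same format appears in Example #14). A hypothetical instantiation, assuming the rest of the PommeBase class supplies random_side() and the wrapper base class:

wrapper = PommeBase(
    'PommeTeamCompetition-v0',
    agent_list='test::agents.SimpleAgent,test::agents.RandomAgent,'
               'test::agents.SimpleAgent,test::agents.RandomAgent')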
Example #13
def main():
    # Instantiate the environment
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        ddpg_agent,
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    env = pommerman.make(args.env_name, agent_list)
    env.seed(RANDOM_SEED)
    # Random seed
    agent_num = 0
    env = EnvWrapper(env, num_agent=agent_num)

    # Generate training data
    stimulator = save_episodes(env)
    stimulator.stimulate()

    observations = []
    actions = []
    rewards = []
    for episode in stimulator.episodes:
        observations.append(episode.observations)
        actions.append(episode.actions)
        rewards.append(episode.reward)

    observations_merged = np.concatenate(observations)
    actions_merged = np.concatenate(actions)
    rewards_merged = np.concatenate(rewards)

    np.save(train_data_obs, observations_merged)
    np.save(train_data_labels, actions_merged)
    np.save(train_data_reward, rewards_merged)
Example #14
def main():
    '''Simple function to bootstrap a game.
       
       Use this as an example to set up your training env.
    '''
    # Print all possible environments in the Pommerman registry
    print(pommerman.REGISTRY)

    # Generate a json every 5 episodes
    json_check = 5

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
    ]
    deep_agents = 'test::agents.SimpleAgent,test::agents.RandomAgent,test::agents.RandomAgent,test::agents.SimpleAgent'
    #agents.DockerAgent("multiagentlearning/hakozakijunctions", port=12345),
    #agents.DockerAgent("multiagentlearning/eisenach", port=12345),
    #agents.DockerAgent("multiagentlearning/skynet955", port=12345),

    # Make the "Free-For-All" environment using the agent list
    config = 'PommeFFACompetition-v0'
    #config = 'PommeTeamCompetition-v1'
    env = pommerman.make(config, agent_list)

    # Run the episodes just like OpenAI Gym
    for i_episode in range(20):
        if i_episode % json_check == 0:
            fight.run(
                config,
                deep_agents,
                record_json_dir="test_json/test_json" + str(i_episode)
            )  # GIVES ME ERROR DURING env.save_json for anything except FFA
        else:
            state = env.reset()
            done = False
            while not done:
                actions = env.act(state)
                state, reward, done, info = env.step(actions)

            print('Episode {} finished'.format(i_episode))
            print("Final Result: ", info)

    env.close()
Example #15
 def _thunk():
     agent_list = [
         # agents.SimpleAgent(),
         agents.RandomAgent(),
         agents.BaseAgent(),
         agents.SimpleAgent(),
         agents.SimpleAgent()
     ]
     env = pommerman.make(env_id, agent_list)
     return env
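A thunk like _thunk above is usually returned by an enclosing factory (assumed here to be called make_env(env_id)) and handed to a vectorized-environment wrapper; a sketch assuming OpenAI baselines is installed:

from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

# Build several Pommerman environments, each running in its own subprocess.
vec_env = SubprocVecEnv([make_env('PommeFFACompetition-v0') for _ in range(8)])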
Example #16
def main():
    tf.reset_default_graph()
    # Print all possible environments in the Pommerman registry
    # print(pommerman.registry)
    sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    # sess = tf_debug.TensorBoardDebugWrapperSession(sess, 'localhost:6064')

    # Create a set of agents (exactly four)
    ddpg_agent = DdpgAgent(id=3, sess=sess)
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        ddpg_agent,
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    env = pommerman.make(args.env_name, agent_list)
    env.seed(RANDOM_SEED)

    print('HERE0', sess)
    ddpg_agent.train_transformer(sess, env)
    print('her2')
    print(9 / 0)  # note: this raises ZeroDivisionError, so execution stops here after training
    r_sum = np.zeros(args.num_steps)  # one accumulator per outer game

    for i in range(args.num_steps):
        # Make the "Free-For-All" environment using the agent list
        env.reset()
        # Run the episodes just like OpenAI Gym

        for i_episode in range(args.max_episode_length):
            state = env.reset()

            done = False
            while not done:

                # if args.display:
                #     env.render()

                actions = env.act(state)
                state, reward, done, info = env.step(actions)
                r_sum[i] += reward[0]

            if i_episode > 300:
                break

        print('Game {} finished'.format(i))

    np.savetxt(args.outdir + '/result_2simple_2random.csv', r_sum, fmt='%1.4e')
    env.close()
Example #17
def generate_data(EPISODES, save_file_nm, shuffle_agents=False):
    rnn_agent = RNN_Agent()

    # Init dataset
    dset = dataset(rnn_agent.RNN_SEQUENCE_LENGTH, save_file_nm,
                   rnn_agent.utils)
    if os.path.exists(save_file_nm): dset.load()

    agent_list = [
        rnn_agent,
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.SimpleAgent()
    ]
    rnn_agent_index = agent_list.index(rnn_agent)

    if shuffle_agents: shuffle(agent_list)
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    wins = {}
    iter_num = 0
    for an_episode in range(EPISODES):

        state = env.reset()

        #-------------------------------------------------------------------
        done = False
        episode_obs = []
        episode_acts = []
        #while not done and rnn_agent.is_alive:
        while not done:
            #env.render()
            actions = env.act(state)
            episode_acts.append(actions[rnn_agent_index])
            episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
            state, reward, done, info = env.step(actions)

            iter_num += 1
        #-------------------------------------------------------------------

        # Final timestep observation
        episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
        dset.add_episode(episode_obs, episode_acts)

        #print(info)
    #print("Median Act Time: {} seconds".format(np.median(np.array(rnn_agent.act_times))))

    env.close()
    dset.save()
    rnn_agent.sess.close()
    tf.reset_default_graph()
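A hypothetical call to the collection function above (episode count and file name are illustrative):

# Collect 100 FFA episodes into a dataset file, shuffling seats before the run.
generate_data(100, 'ffa_rnn_dataset.pkl', shuffle_agents=True)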
Example #18
    def load_game(self):
        self.hide()
        with open('./replay/000.pickle', 'rb') as f:
            replay_game = pickle.load(f)
        num_players = replay_game.pop()
        agents_list = [agents.RandomAgent() for i in range(num_players)]

        env = pommerman.make('PommeFFACompetition-v0', agents_list,
                             game_state_file='./replay/000.json')

        # Run the episodes just like OpenAI Gym
        env.reset()
        for actions in replay_game:
            env.render()
            env.step(actions)
        env.close()
        self.show()
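load_game above replays a pickle holding one action list per step, with the player count appended as the final element, alongside an initial-state JSON. A plausible recorder for producing such a file (assumed, not from the source; env and agent_list come from a setup like the other examples, and './replay/000.json' is assumed to come from the environment's JSON-recording support, cf. record_json_dir in Example #14):

import pickle

recorded = []
state = env.reset()
done = False
while not done:
    actions = env.act(state)
    recorded.append(actions)          # one action list per step
    state, reward, done, info = env.step(actions)
recorded.append(len(agent_list))      # load_game() pops this player count back off
with open('./replay/000.pickle', 'wb') as f:
    pickle.dump(recorded, f)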
Example #19
def main():
    env = pommerman.make('PommeFFACompetition-v0', [
        agents.PlayerAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        StoppingAgent(),
    ])

    for i_episode in range(1):
        state = env.reset()
        done = False
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            if done:
                if 'winners' in info:
                    win_player = info['winners'][0] + 1
                    print(f'win {win_player}P')
                else:
                    print('tie: no winner')
        print(f'Episode {i_episode} finished')
    env.close()
Example #20
def main():

    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)
    # sess = tf.Session()
    #sess = tf_debug.TensorBoardDebugWrapperSession(sess, 'localhost:6064')

    # Create a set of agents (exactly four)
    agent_list = [
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.RandomAgent(),
        agents.RandomAgent(),
        # agents.DockerAgent("pommerman/simple-agent", port=12345),
    ]
    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    # Create the Estimator
    estimator_nn1 = tf.estimator.Estimator(model_fn=model_NN1,
                                           model_dir="/tmp/sa_nn1")
    # Set up logging for predictions
    tensors_to_logNN1 = {"probabilities": "softmax_tensor"}
    logging_hook_nn1 = tf.train.LoggingTensorHook(tensors=tensors_to_logNN1,
                                                  every_n_iter=50)

    # Create the Estimator
    estimator_nn2 = tf.estimator.Estimator(model_fn=model_NN2,
                                           model_dir="/tmp/sa_nn2")
    # Set up logging for predictions
    tensors_to_logNN2 = {"probabilities": "softmax_tensor"}
    logging_hook_nn2 = tf.train.LoggingTensorHook(tensors=tensors_to_logNN2,
                                                  every_n_iter=50)

    r_sum = np.zeros(1)
    for i in range(1):
        # Make the "Free-For-All" environment using the agent list
        env.reset()
        # Run the episodes just like OpenAI Gym
        for i_episode in range(1):
            state = env.reset()

            done = False
            curr_state = None
            prev_state = None
            graph = np.random.rand(4, 30).astype("float32") + 0.0001
            #         print(graph)
            pr_action = None
            pr_pr_action = None

            while not done:
                #             env.render()
                actions = env.act(state)
                state, reward, done, info = env.step(actions)
                r_sum[i] += reward[0]

                # as basic implementation I consider only one agent
                prev_state = curr_state
                curr_state = state

                if pr_pr_action is not None:
                    # Train the model
                    for agent_num in range(4):
                        train_input_NN2 = tf.estimator.inputs.numpy_input_fn(
                            x={"state1": np.resize(
                                    state_to_matrix_with_action(curr_state[agent_num], action=pr_action[agent_num])\
                                    .astype("float32"), (1, 49*11)),
                               "graph": np.resize(graph, (1, 4*30))
                              },
                            y=np.asarray([actions[agent_num]]),
                            batch_size=1,
                            num_epochs=None,
                            shuffle=True)

                        train_input_NN1 = tf.estimator.inputs.numpy_input_fn(
                            x={"state1": np.resize(
                                    state_to_matrix_with_action(prev_state[agent_num], action=pr_pr_action[agent_num])\
                                    .astype("float32"), (1, 49 * 11)),
                               "state2": np.resize(
                                    state_to_matrix_with_action(curr_state[agent_num], action=pr_action[agent_num])\
                                   .astype("float32"), (1, 49 * 11))},
                            y=np.asmatrix(graph.flatten()),
                            batch_size=1,
                            num_epochs=None,
                            shuffle=True)

                        estimator_nn1.train(input_fn=train_input_NN1,
                                            steps=200,
                                            hooks=[logging_hook_nn1])

                        estimator_nn2.train(input_fn=train_input_NN2,
                                            steps=200,
                                            hooks=[logging_hook_nn2])
                        predictions = estimator_nn2.predict(
                            input_fn=train_input_NN2)
                        #next_action = np.array(list(p['classes'] for p in predictions))
                pr_pr_action = pr_action
                pr_action = actions
            if i_episode > 300:
                break
        print('Game {} finished'.format(i))

    np.savetxt('result_2simple_2random.csv', r_sum, fmt='%1.4e')
    env.close()
Example #21
def train_C_generate_data(EPISODES,
                          save_file_nm,
                          chk_point_folder,
                          sess_save_step=100,
                          load_model=None,
                          shuffle_agents=False,
                          record=False,
                          plot_reward=False,
                          add_agents=[
                              agents.SimpleAgent(),
                              agents.RandomAgent(),
                              agents.SimpleAgent()
                          ],
                          encourage_win=False,
                          learn=True):
    if plot_reward:
        plt.xlabel('Episode #')
        plt.ylabel('Average reward for last 100 episodes')
    # Init the agent
    rnn_agent = RNN_Agent(model_training='C')

    # For saving model
    saver = tf.train.Saver()

    if not os.path.exists(chk_point_folder):
        os.makedirs(chk_point_folder)
    # Try to recover previous model
    if load_model is not None: load_folder = load_model
    else: load_folder = chk_point_folder
    latest_model = tf.train.latest_checkpoint(load_folder)
    if latest_model is not None:
        saver.restore(rnn_agent.sess, latest_model)
        print("Restored ", latest_model)

    # Init dataset
    if record:
        dset = dataset(rnn_agent.RNN_SEQUENCE_LENGTH, save_file_nm,
                       rnn_agent.utils)
        if os.path.exists(save_file_nm): dset.load()

    # TensorBoard writer
    experimentFolder = datetime.now().isoformat(timespec='minutes')
    C_writer = tf.summary.FileWriter(
        './tboard/train_C_{}_{}'.format(
            save_file_nm.split('.')[0], experimentFolder),
        rnn_agent.sess.graph)
    rnn_agent.summary_writer = C_writer

    agent_list = [rnn_agent] + add_agents

    if shuffle_agents: shuffle(agent_list)
    rnn_agent_index = agent_list.index(rnn_agent)

    env = pommerman.make('PommeFFACompetition-v0', agent_list)

    mean_rewards_list = []
    episode_history = deque(maxlen=100)
    ties = deque(maxlen=100)

    rnn_wins = deque(maxlen=100)
    other_wins = deque(maxlen=100)

    for i_episode in range(EPISODES):
        # initialize
        state = env.reset()
        prev_state = np.copy(state)
        total_rewards = 0

        #-------------------------------------------------------------------
        done = False
        episode_obs = []
        episode_acts = []
        #while not done and rnn_agent.is_alive:
        t = 0
        wins = {}
        while not done and rnn_agent.is_alive:
            t += 1
            #env.render()
            actions = env.act(state)

            episode_acts.append(actions[rnn_agent_index])
            episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))

            state, reward, done, info = env.step(actions)
            if not encourage_win:
                reward[rnn_agent_index] = reward[
                    rnn_agent_index] if not rnn_agent.is_alive else 0.1
            else:
                reward[rnn_agent_index] = reward[
                    rnn_agent_index] if not rnn_agent.is_alive else 0.09
            if encourage_win and done and 'winners' in info:
                reward[rnn_agent_index] = 5 if info['winners'][
                    0] == rnn_agent_index else -5
            #print("t: {} \t reward: {}\t Agent alive: {}".format(t, reward[rnn_agent_index], rnn_agent.is_alive) )

            total_rewards += reward[rnn_agent_index]
            rnn_agent.storeRollout(
                np.concatenate(
                    (rnn_agent.utils.input(prev_state[rnn_agent_index]),
                     rnn_agent.rnn_state)), actions[rnn_agent_index],
                reward[rnn_agent_index])
            prev_state = np.copy(state)
        #-------------------------------------------------------------------
        if 'winners' in info:
            rnn_wins.append(1 if info['winners'][0] == rnn_agent_index else 0)
            other_wins.append(
                1 if info['winners'][0] != rnn_agent_index else 0)
        wins_ratio = np.mean(other_wins) / np.mean(rnn_wins)
        tflog('Other wins/agent wins ratio (100 wins)', wins_ratio)
        #print('Other wins/agent wins ratio (100 wins)',  wins_ratio)

        ties.append(1 if 'Tie' in info else 0)
        tie_ratio = np.mean(ties) / np.mean(rnn_wins)
        #tflog('ties/agent wins ratio (100 steps)',  tie_ratio)

        # Final timestep observation
        episode_obs.append(rnn_agent.utils.input(state[rnn_agent_index]))
        if record: dset.add_episode(episode_obs, episode_acts)

        rnn_agent.update_C()

        episode_history.append(total_rewards)
        mean_rewards = np.mean(episode_history)

        print("Episode {}".format(i_episode))
        print("Finished after {} timesteps".format(t + 1))
        print("Reward for this episode: {}".format(total_rewards))
        print("Average reward for last 100 episodes: {:.2f}".format(
            mean_rewards))
        mean_rewards_list.append(mean_rewards)
        #tflog('Iteration Number',  rnn_agent.train_iteration)
        tflog('Average reward for last 100 episodes', mean_rewards)

        # Save the model
        if i_episode % sess_save_step == 0:
            if learn:
                saver.save(rnn_agent.sess,
                           chk_point_folder,
                           global_step=rnn_agent.C_step)
            if record: dset.save()

        # Plot rewards
        if plot_reward:
            x = np.arange(i_episode + 1)
            # Linear Reg
            fit = np.polyfit(x, mean_rewards_list, 1)
            fit_fn = np.poly1d(fit)

            plt.plot(x, mean_rewards_list, '.', x, fit_fn(x), '--k')
            plt.savefig("test.png")
            plt.gcf().clear()
        #print(info)
    print("Median Act Time: {} seconds".format(
        np.median(np.array(rnn_agent.act_times))))

    env.close()
    rnn_agent.sess.close()
    tf.reset_default_graph()
Example #22
def main():
    if platform.system() == 'Darwin':
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    process = psutil.Process(os.getpid())

    # MARK: - Create the environment
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
    ]

    env = pommerman.make('OneVsOne-v0', agent_list, render_mode='human')

    # MARK: - Allowing to save the model
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    checkpoint_path = os.path.join(".", "models", now, "-{epoch:04d}.ckpt")
    # MARK: - Log for tensorboard
    log_dir = os.path.join(
        "logs",
        now,
    )
    tensorflow_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                         profile_batch=5,
                                                         histogram_freq=1)
    file_writer_rewards = tf.summary.create_file_writer(log_dir + "/metrics")

    # =========== (HYPER)PARAMETERS AND VARIABLES =========== #
    LIST_SIZE = 10000
    D = deque(maxlen=LIST_SIZE)
    DISCOUNT_RATE = 0.8
    TAU = 0
    MAX_TAU = 1000
    ACTION_SPACE = env.action_space.n
    TIME_CHANNELS_SIZE = 1
    INPUT_SHAPE = list(env.get_observation_space()) + [TIME_CHANNELS_SIZE]
    BATCH_SIZE = 32
    N = BATCH_SIZE
    N_EPISODES = 1000
    EXPLORATION_BASE = 1.02
    EXPLORATION_RATE = 1
    MINIMAL_EXPLORATION_RATE = 0.01
    TD_ERROR_DEFAULT = 0
    FRAME_COUNT = 0
    print(f"Pixel space of the game {INPUT_SHAPE}")

    # ================== CONTINUE TRAIN FROM LOADED MODEL ==================== #
    # approximator_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    # target_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    #
    # MODEL_PATH = "models/20200119-121818"
    # latest = tf.train.latest_checkpoint(MODEL_PATH)
    # print(f"Loading model from {latest}")
    #
    # approximator_model.load_weights(latest)
    # target_model.load_weights(latest)
    # ======================================================================== #

    # =================== START WITH NEW MODEL =============================== #
    approximator_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    target_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    # ======================================================================== #

    # ===== INITIALISATION ======
    acc_nonzeros = []
    actions_available = [
        str(action).split(".")[1] for action in constants.Action
    ]

    print("Running the init")
    for n in range(N):
        if FRAME_COUNT > BATCH_SIZE:
            break
        state_obs = env.reset()
        done = False
        while not done:
            FRAME_COUNT += 1
            actions_all_agents = env.act(state_obs)
            state_obs, reward, done, info, pixels = env.step2(
                actions_all_agents)

            D.append([
                preprocess(pixels), reward[0], actions_all_agents[0],
                reward[0], done
            ])
        print('Init episode {} finished'.format(n))

    for episode in range(N_EPISODES):
        start_time = time.time()

        if TAU >= MAX_TAU:
            TAU = 0
            # Copy the weights from policy model to target model
            target_model.set_weights(approximator_model.get_weights())
            print("===> Updated weights")

        EXPLORATION_RATE = np.power(
            EXPLORATION_BASE, -episode
        ) if EXPLORATION_RATE > MINIMAL_EXPLORATION_RATE else MINIMAL_EXPLORATION_RATE
        # EXPLORATION_RATE = 1 - (episode * 1 / N_EPISODES) if EXPLORATION_RATE > MINIMAL_EXPLORATION_RATE else MINIMAL_EXPLORATION_RATE

        print(
            f"Running episode {episode} with exploration rate: {EXPLORATION_RATE}"
        )

        # Intial step for the episode
        state_obs = env.reset()
        actions = env.act(state_obs)
        initial_observation, reward, done, info, pixels = env.step2(
            actions, render=True)

        state = preprocess(pixels)

        done = False

        # next_state = initial_state.copy()  # To remove all the information of the last episode

        episode_rewards = []
        frame_cnt = 0
        acc_qs = []
        acc_actions = []
        acc_frames = []
        action_str = ""

        while not done:
            # https://danieltakeshi.github.io/2016/11/25/frame-skipping-and-preprocessing-for-deep-q-networks-on-atari-2600-games/
            frame_cnt += 1
            TAU += 1

            actions_all_agents = env.act(state_obs)
            action = actions_all_agents[0]
            q_values = np.zeros((1, ACTION_SPACE))
            is_explore = random.choices(
                (True, False), (EXPLORATION_RATE, 1 - EXPLORATION_RATE))[0]
            if not is_explore:
                # Greedy action
                init_mask = tf.ones([1, ACTION_SPACE])
                init_state = state
                q_values = approximator_model.predict(
                    [tf.reshape(init_state, [1] + INPUT_SHAPE), init_mask])
                action = np.argmax(q_values)
                actions_all_agents[0] = action

            acc_qs.append(q_values[0])
            # print(
            #     action_str) if action_str != f"Action taken: {actions_available[action]}" else None

            state_obs, reward, done, info, pixels = env.step2(
                actions_all_agents)

            flipped = np.flip(pixels, (0))
            acc_frames.append(flipped)
            acc_actions.append(action)

            episode_rewards.append(reward[0])
            state = preprocess(pixels)
            D.append([
                state, reward[0], actions_all_agents[0], TD_ERROR_DEFAULT, done
            ])
            if (episode + 1) % 5 == 0:
                action_str = f"Action taken: {actions_available[action]} was {'greedy' if not is_explore else 'explored'}"
                print(action_str)

        memory_length = len(D)
        print(f"Number of frames in memory {memory_length}")
        # experience_batch = take_sample(D, approximator_model, target_model, BATCH_SIZE, ACTION_SPACE)
        ids, importance, max_td_err = take_sample(D,
                                                  BATCH_SIZE,
                                                  beta=1 -
                                                  (episode / N_EPISODES))
        TD_ERROR_DEFAULT = max_td_err
        experience_batch = [(D[idx],
                             D[idx + 1]) if idx < memory_length - 1 else
                            (D[idx - 1], D[idx]) for idx in ids]

        set_of_batch_states = tf.constant(
            [exp[0][0] for exp in experience_batch])
        set_of_batch_next_states = tf.constant(
            [exp[1][0] for exp in experience_batch])

        # Gather actions for each batch item
        set_of_batch_actions = tf.one_hot(
            [exp[0][2] for exp in experience_batch], ACTION_SPACE)

        # Maybe unnecessary - We are using the double q mask instead.
        next_q_mask = tf.ones([BATCH_SIZE, ACTION_SPACE])

        set_of_batch_states = tf.cast(tf.reshape(
            set_of_batch_states, set_of_batch_states.shape + [1]),
                                      dtype=tf.float32)
        double_q_mask = tf.one_hot(
            tf.argmax(approximator_model.predict(
                [set_of_batch_states, next_q_mask]),
                      axis=1), ACTION_SPACE)  # http://arxiv.org/abs/1509.06461

        set_of_batch_next_states = tf.cast(tf.reshape(
            set_of_batch_next_states, set_of_batch_next_states.shape + [1]),
                                           dtype=tf.float32)
        next_q_values = tf.constant(
            target_model.predict([set_of_batch_next_states, double_q_mask]))

        # Gather rewards for each batch item
        set_of_batch_rewards = tf.constant(
            [exp[0][1] for exp in experience_batch], dtype=next_q_values.dtype)
        episode_nonzero_reward_states = (
            tf.math.count_nonzero(set_of_batch_rewards) / BATCH_SIZE) * 100
        print(
            f"Number of information yielding states: {episode_nonzero_reward_states}"
        )

        is_terminal = tf.constant(
            [0 if exp[1][4] else 1 for exp in experience_batch],
            dtype=next_q_values.dtype)
        next_q = set_of_batch_rewards + \
            (DISCOUNT_RATE * tf.reduce_max(next_q_values, axis=1)) * is_terminal
        init_q_values = approximator_model.predict(
            [set_of_batch_states, set_of_batch_actions])
        init_q = tf.reduce_sum(init_q_values, axis=1)
        td_error = (next_q - init_q).numpy()

        history = approximator_model.fit(
            [set_of_batch_states, set_of_batch_actions],
            next_q,
            batch_size=BATCH_SIZE,
            verbose=1,
            callbacks=[tensorflow_callback],
            sample_weight=importance)

        for idx, exp in enumerate(experience_batch):
            exp[0][3] = td_error[idx]

        # Wrap up
        loss = history.history.get("loss", [0])[0]
        time_end = np.round(time.time() - start_time, 2)
        memory_usage = process.memory_info().rss
        print(f"Current memory consumption is {memory_usage}")
        print(
            f"Loss of episode {episode} is {loss} and took {time_end} seconds")
        random_experience_idx = random.choice(range(len(experience_batch) - 1))
        random_experience = experience_batch[random_experience_idx][0]
        random_experience_next = experience_batch[random_experience_idx][1]

        # print(tmp.shape)
        episode_image = plot_to_image(
            image_grid_pommerman(random_experience, random_experience_next,
                                 [action for action in constants.Action]))
        image_qs = utils.plot_to_image(
            utils.plot_q(np.array(acc_qs),
                         [action for action in constants.Action]))
        image_pommerman = utils.plot_to_image(
            utils.show_pommerman_game(acc_frames, acc_actions,
                                      [action for action in constants.Action]))
        with file_writer_rewards.as_default():
            tf.summary.scalar('episode_rewards',
                              np.sum(episode_rewards),
                              step=episode)
            tf.summary.scalar('episode_loss', loss, step=episode)
            tf.summary.scalar('episode_time_in_secs', time_end, step=episode)
            tf.summary.scalar('episode_nr_frames', frame_cnt, step=episode)
            tf.summary.scalar('episode_exploration_rate',
                              EXPLORATION_RATE,
                              step=episode)
            tf.summary.scalar('episode_mem_usage', memory_usage, step=episode)
            tf.summary.scalar('episode_frames_per_sec',
                              np.round(frame_cnt / time_end, 2),
                              step=episode)
            tf.summary.histogram('q-values', next_q_values, step=episode)
            tf.summary.image('q-values-over-time', image_qs, step=episode)
            tf.summary.image('pommerman-game', image_pommerman, step=episode)

            tf.summary.scalar('episode_mem_usage_in_GB',
                              np.round(memory_usage / 1024 / 1024 / 1024),
                              step=episode)
            tf.summary.image('episode_example_state',
                             episode_image,
                             step=episode)
            if (episode + 1) % 5 == 0:
                acc_nonzeros.append(episode_nonzero_reward_states)
                tf.summary.histogram('episode_nonzero_reward_states',
                                     acc_nonzeros,
                                     step=(episode + 1) // 5)
            else:
                acc_nonzeros.append(episode_nonzero_reward_states)
        if (episode + 1) % 50 == 0:
            model_target_dir = checkpoint_path.format(epoch=episode)
            approximator_model.save_weights(model_target_dir)
            print(f"Model was saved under {model_target_dir}")
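Example #22 calls take_sample(D, BATCH_SIZE, beta=...), expecting sampled indices, importance weights and the current maximum TD error, where each entry of D is [state, reward, action, td_error, done]. One plausible prioritized-replay implementation of that helper (an assumption, not the original code):

import numpy as np

def take_sample(D, batch_size, beta=1.0, alpha=0.6, eps=1e-3):
    # Sample transitions with probability proportional to |TD error|^alpha (slot 3 of each entry).
    td_errors = np.array([abs(entry[3]) for entry in D]) + eps
    probs = td_errors ** alpha
    probs /= probs.sum()
    ids = np.random.choice(len(D), size=batch_size, p=probs)
    # Importance-sampling weights compensate for the non-uniform sampling.
    importance = (len(D) * probs[ids]) ** (-beta)
    importance /= importance.max()
    return ids, importance, float(td_errors.max())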
Example #23
    def run(self):
        # If we move this to "init", we get an error on recursion depth
        self.A3CAgent = A3CAgent(self.lnet)
        self.agentList = [
            self.A3CAgent,
            agents.SimpleAgent(),
            agents.RandomAgent(),
            agents.RandomAgent()
        ]
        self.env = env = pommerman.make('PommeFFACompetition-v0',
                                        self.agentList)

        total_step = 1
        while self.g_ep.value < MAX_EP:
            # Step 2). worker interacts with environment
            s_act = self.env.reset()
            max_ammo = old_max_ammo = 1
            ep_r = 0.
            self.render = False  # self.g_ep.value % 20==0
            self.A3CAgent.reset_lstm()
            if self.name == 'w0':
                enc1 = abs(torch.sum(self.gnet.encoder1.weight.data).item())
                enc2 = abs(torch.sum(self.gnet.encoder2.weight.data).item())
                enc3 = abs(torch.sum(self.gnet.encoder3.weight.data).item())
                conv1 = abs(torch.sum(self.gnet.conv1.weight.data).item())
                conv2 = abs(torch.sum(self.gnet.conv2.weight.data).item())
                conv3 = abs(torch.sum(self.gnet.conv3.weight.data).item())
                conv4 = abs(torch.sum(self.gnet.conv4.weight.data).item())
                cl = abs(torch.sum(self.gnet.critic_linear.weight.data).item())
                alstm1 = abs(
                    torch.sum(self.gnet.actor_lstm.weight_ih_l0.data).item())
                alstm2 = abs(
                    torch.sum(self.gnet.actor_lstm.weight_hh_l0.data).item())
                aout = abs(torch.sum(self.gnet.actor_out.weight.data).item())
                f = open("AbsSummedWeights_ActorCritic_v2.txt", "a")
                f.write(
                    '{0:.5f} \t {1:.5f} \t {2:.5f} \t {3:.5f} \t {4:.5f} \t {5:.5f} \t {6:.5f} \t {7:.5f} \t {8:.5f} '
                    '\t {9:.5f} \t {10:.5f} \n'.format(enc1, enc2, enc3, conv1,
                                                       conv2, conv3, conv4,
                                                       alstm1, alstm2, aout,
                                                       cl))
                f.close()
            while True:
                # only render worker 0
                if self.name == 'w0' and self.render:
                    self.env.render()

                agent_actions = self.env.act(s_act)

                a = agent_actions[self.agent_nr]
                self.saved_oh_actions[:, :
                                      -1] = self.saved_oh_actions[:,
                                                                  1:]  # time shift
                self.saved_oh_actions[:,
                                      -1] = self.empty_oh_action[:,
                                                                 0]  # erase last value
                self.saved_oh_actions[a, -1] = 1  # insert new one-hot

                s_new, rewards, done, _ = self.env.step(agent_actions)

                # not(10 in s_new[self.agent_nr]['alive']) #if done or agent 10 is dead
                done = done or rewards[self.agent_nr] == -1
                max_ammo = max(max_ammo, s_act[self.agent_nr]['ammo'])
                # reward and buffer
                r = rewards[self.agent_nr]
                # if (10 in s_act[self.agent_nr]['alive']) and total_step!=1:
                #    r = get_reward(s_new,s_act,self.agent_nr,r,max_ammo,old_max_ammo,a,a_old,self.saved_oh_actions)
                ep_r += r
                self.A3CAgent.add_reward(r)
                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net
                    update_glob_net(self.opt, self.lnet, self.gnet,
                                    self.A3CAgent, GAMMA)
                    if done:
                        record(self.g_ep, self.g_ep_r, ep_r, self.res_queue,
                               self.global_nr_steps,
                               s_new[self.agent_nr]['step_count'], self.name)
                        break
                s_act = s_new
                old_max_ammo = max_ammo
                a_old = a
                total_step += 1
        self.res_queue.put(None)
Example #24
    def is_agent_alive(self, id, alive_agents=None):
        # Signature and default reconstructed: the snippet is truncated here in the source.
        if alive_agents is None:
            alive_agents = self.prev_obs[0]['alive']
        return (id + 10) in alive_agents

    def reset(self):
        self.prev_obs = self.env.reset()
        obs = {}
        self.reset_stat()
        for i in range(4):
            if self.is_agent_alive(i):
                obs[i] = featurize(self.prev_obs[i])

        return obs


if __name__ == '__main__':
    agent_list = [
        agents.RandomAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    env = pommerman.make(
        'PommeTeam-v0',
        agent_list,
        # '/home/lucius/working/projects/pomme_rllib/resources/one_line_state.json'
    )
    obs = env.reset()

    while True:
        features = featurize(obs[0])
        for i in range(17):
            print(features[i])
Example #25
class PomFFA(gym.Env):
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v

        print("pomme_config: ")
        print(pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or (env_config
                              and env_config.get("is_training", True)):
            # initialize env twice could raise error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preprocess(obs)
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 1
            else:
                return -1

        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -1

        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -1
        #
        # x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]
        #
        # for w in range(11):
        #     if blast[x][w] > int(math.fabs(w-y)):
        #         return -10
        #
        #     if blast[w][y] > int(math.fabs((w-x))):
        #         return -10

        return 0

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)

        # print(obs)

        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id
                not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))
        # board_size = 11

        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def preprocess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
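A minimal way to drive the wrapper above by hand (action 0 corresponds to constants.Action.Stop; a trainer would normally supply the actions):

pom_env = PomFFA()
obs = pom_env.reset()
done = False
while not done:
    pom_env.render()
    obs, reward, done, _ = pom_env.step(0)  # keep the learning agent standing still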
Example #26
    def __init__(self, sparrer_type, agent_id, model=None):
        super().__init__()

        if sparrer_type == constants.SIMPLE_SPARRER:
            self.modelled_env = pommerman.make('PommeTeamCompetition-v0', agent_list=[agents.SimpleAgent() for _ in range(4)])
        elif sparrer_type == constants.MODEL_SPARRER:
            # FIXME: may require changes
            self.modelled_env = pommerman.make('PommeTeamCompetition-v0', agent_list=[agents.SmartAgent(model) for _ in range(4)])
        elif sparrer_type == constants.RANDOM_SPARRER:
            # FIXME: may require changes
            self.modelled_env = pommerman.make('PommeTeamCompetition-v0', agent_list=[agents.RandomAgent() for _ in range(4)])
        else:
            raise ValueError('Invalid sparrer type')

        self.training_examples = []
        self.memory = None
        self.modelled_env.reset()
        self.agent_id = agent_id
Example #27
def main():
    # Create the environment
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        # agents.SimpleAgent(),
        # agents.SimpleAgent(),
    ]
    env = pommerman.make('OneVsOne-v0',
                         agent_list, render_mode='human')

    if platform.system() == 'Darwin':
        print("MacBook Pro user detected. U rule.")
        os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    # =========== STATS =========== #

    global now
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    accumulated_frames = []
    episode_rewards = []

    # =========== (HYPER)PARAMETERS AND VARIABLES =========== #

    ACTION_SPACE = env.action_space.n
    TIME_CHANNELS_SIZE = 1
    INPUT_SHAPE = list(env.get_observation_space()) + [TIME_CHANNELS_SIZE]
    N_EPISODES = 10000

    MODEL_PATH = "models/20200122-104419"
    latest = tf.train.latest_checkpoint(MODEL_PATH)
    print(f"Loading model from {latest}")

    ## - Comment 2 lines below for running Random/SimpleAgent- ##
    restored_model = create_model(INPUT_SHAPE, ACTION_SPACE)
    restored_model.load_weights(latest)

    actions_available = [str(action).split(".")[1]
                         for action in constants.Action]

    for episode in range(N_EPISODES):
        start_time = time.time()

        print(
            f"Running episode {episode}.")

        # Intial step for the episode
        state_obs = env.reset()
        actions = env.act(state_obs)
        initial_observation, reward, done, info, pixels = env.step2(
            actions, render=True)

        state = preprocess(pixels)

        done = False
        frame_cnt = 0
        accumulated_reward = 0
        action_str = ""

        while not done:
            frame_cnt += 1

            actions_all_agents = env.act(state_obs)

            ## - Comment out from here - ##
            init_mask = tf.ones([1, ACTION_SPACE])
            init_state = state

            q_values = restored_model.predict(
                [tf.reshape(init_state, [1] + INPUT_SHAPE), init_mask])

            action = np.argmax(q_values)
            # print(q_values)
            # print(
            #     f"Action taken: {actions_available[action]}") if action_str != f"Action taken: {actions_available[action]}" else None

            actions_all_agents[0] = action
            ## - Until here, when you want to use a Random/SimpleAgent instead - ##

            state_obs, reward, done, info, pixels = env.step2(
                actions_all_agents)
            state = preprocess(pixels)
            accumulated_reward += reward[0]
            action_str = f"Action taken: {actions_available[action]}"

        time_end = np.round(time.time() - start_time, 2)

        accumulated_frames.append(frame_cnt)
        episode_rewards.append(accumulated_reward)
        save_json(accumulated_frames=accumulated_frames,
                  episode_rewards=episode_rewards)

        print(f"Running at {np.round(frame_cnt / time_end)} frames per second")
Example #28
 def __init__(self):
     self._agent = agents.RandomAgent()