def update_observation_for_shipyard(board: Board, uid, action):
    """Simulate the environment one step forward and return the updated observation.
    https://www.kaggle.com/sam/halite-sdk-overview#Simulating-Actions-(Lookahead)
    """
    shipyard = board.shipyards[uid]
    shipyard.next_action = action
    next_board = board.next()
    return Observation(next_board.observation)
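# A minimal lookahead sketch (hypothetical wiring, not part of the original
# snippet): build a Board from a raw observation, queue a SPAWN on one of our
# shipyards, and inspect the simulated next observation. Assumes
# `raw_observation` and `configuration` come from a running "halite" episode.
from kaggle_environments.envs.halite.helpers import Board, Observation, ShipyardAction

board = Board(raw_observation, raw_configuration=configuration)
my_shipyards = board.current_player.shipyards
if my_shipyards:
    simulated_obs = update_observation_for_shipyard(
        board, my_shipyards[0].id, ShipyardAction.SPAWN)
    # players[player][0] is the player's halite bank after paying the spawn cost
    print(simulated_obs.players[simulated_obs.player][0])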
    def __call__(self, observation: Dict[str, Any],
                 configuration: Dict[str, Any]) -> Dict[str, str]:
        raw_observation = observation
        step_observation = Observation(observation)

        raw_observation, shipyard_simulated_step_memory = self.get_moves_for_all_shipyards(
            raw_observation=raw_observation,
            step_observation=step_observation,
            episode_number=0,
            step_number=0)

        self.get_moves_for_all_ships(raw_observation=raw_observation,
                                     step_observation=step_observation,
                                     episode_number=0,
                                     step_number=0)

        actions_for_step = self.actions_for_step
        return actions_for_step
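# Usage sketch: because __call__ takes (observation, configuration), an instance
# of this class can be handed directly to kaggle_environments as an agent.
# `halite_agent` is assumed here to be a fully constructed instance of the class above.
from kaggle_environments import make

env = make("halite", configuration={"episodeSteps": 400, "size": 21})
env.run([halite_agent, "random"])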
Example #3
    def play_episode(self, max_steps):
        raw_observation: dict = self.env.reset()[0].__dict__
        raw_observation['players'][raw_observation['player']][0] -= self.handicap

        step_observation = Observation(raw_observation)
        episode_score = []

        print("Episode start")
        print(raw_observation['players'])

        for step in range(max_steps):
            self.step_number = step
            step_scores = self.play_step(raw_observation=raw_observation,
                                         step_observation=step_observation)
            episode_score.append(step_scores)
        self.episode_number += 1
        return episode_score
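# Observation layout as these examples index it: each entry of
# raw_observation['players'] is
#   [player_halite, {shipyard_id: position}, {ship_id: [position, cargo_halite]}],
# so the handicap above simply reduces our own halite bank before play begins.
# (`raw_observation` here stands for the dict produced by env.reset() as above.)
player_halite, shipyards, ships = raw_observation['players'][raw_observation['player']]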
def halite_run_agent(observation, configuration):

    raw_observation = observation
    step_observation = Observation(observation)

    raw_observation, shipyard_simulated_step_memory = halite_agent.get_moves_for_all_shipyards(
        raw_observation=raw_observation,
        step_observation=step_observation,
        episode_number=0,
        step_number=0)

    raw_observation, ship_simulated_step_memory = halite_agent.get_moves_for_all_ships(
        raw_observation=raw_observation,
        step_observation=step_observation,
        episode_number=0,
        step_number=0)

    actions_for_step = halite_agent.actions_for_step
    return actions_for_step
    def play_episode(self, max_steps):
        raw_observation: dict = self.env.reset()[0].__dict__
        raw_observation['players'][raw_observation['player']][0] -= self.handicap

        episode_scores = []

        if self.episode_number % 5 == 0:
            print("Episode {}".format(self.episode_number))
            print(raw_observation['players'])

        for step in range(max_steps):
            self.step_number = step
            step_observation = Observation(raw_observation)
            raw_observation, done, step_scores = self.play_step(
                raw_observation=raw_observation,
                step_observation=step_observation)
            episode_scores.append(step_scores)
            if done:
                self.episode_number += 1
                return episode_scores
        self.episode_number += 1
        return episode_scores
Example #6
def _run(args):
    print('Running')

    env = make("halite", {'episodeSteps': 200}, debug=True)

    """
    Setup all of the hyperparameters and other inputs
    """
    try:
        learn_rate = args[0].learning_rate
        discount_factor = args[0].discount_factor
        batch_size = args[0].batch_size
        layer_1_dims_ship = args[0].layer_1_dims_ship
        layer_1_dims_shipyard = args[0].layer_1_dims_shipyard
        epsilon = args[0].epsilon
        if args[0].mode == 'Train':
            # Add timestamp; important for HP tuning so models don't clobber each
            # other.
            model_dir = hp_directory(args[0].model_dir)
        else:
            model_dir = args[0].model_dir
        writer = tf.summary.create_file_writer(os.path.join(model_dir, 'metrics'))
    except Exception as e:
        print('Failure parsing args')
        print(e)
        raise e
    try:

        """
        Extra helpers needed for halite
        """

        ship_state_wrapper = ShipStateWrapper(
            radius=5,
            max_frames=2,
            map_size=env.configuration['size']
        )
        shipyard_state_wrapper = ShipYardStateWrapper(
            radius=5,
            max_frames=1,
            map_size=env.configuration['size']
        )

        ship_agent = Agent(
            alpha=learn_rate,
            gamma=discount_factor,
            n_actions=6,
            batch_size=batch_size,
            epsilon=epsilon,
            input_dims=ship_state_wrapper.state_size,
            fc1_dims=layer_1_dims_ship
        )

        shipyard_agent = Agent(
            alpha=learn_rate,
            gamma=discount_factor,
            n_actions=2,
            batch_size=batch_size,
            epsilon=epsilon,
            input_dims=shipyard_state_wrapper.state_size,
            fc1_dims=layer_1_dims_shipyard
        )

        halite_env = HaliteEnv(
            environment=env,
            opponents=[do_nothing_agent],
            ship_state_wrapper=ship_state_wrapper,
            shipyard_state_wrapper=shipyard_state_wrapper,
            ship_reward_type='total_halite'
        )

        halite_agent = HaliteAgent(
            ship_agent=ship_agent,
            shipyard_agent=shipyard_agent,
            configuration=env.configuration,
            halite_env=halite_env,
            ship_state_wrapper=ship_state_wrapper,
            shipyard_state_wrapper=shipyard_state_wrapper
        )
    except Exception as e:
        print('Failed setting up Halite Utilities')
        print(e)
        raise e

    print("STARTING...")

    """
    Loading previous model if possible
    """

    if not os.path.exists(model_dir) and args[0].save_model:
        os.makedirs(model_dir)
    if not os.path.exists(os.path.join(model_dir, 'results')) and args[0].save_model:
        os.makedirs(os.path.join(model_dir, 'results'))
    print("MODEL WILL BE STORED AT: ", model_dir)

    if args[0].mode != 'Train':
        trained_model_path = args[0].load_model
        try:
            ship_agent.load_weights(trained_model_path)
        except Exception as e:
            print('{} is not a valid .h5 model: {}'.format(trained_model_path, e))

    episode_reward, episode_number, done = 0, 0, False

    if done or args[0].mode != 'Train':
        state = env.reset()
    else:
        state, board = halite_env.reset()

    print('Beginning training loop')

    episode_results = []

    for curr_step in range(args[0].steps):

        raw_observation = state
        step_observation = Observation(raw_observation)

        """
        If we are not in training mode
        
            episode_reward = agent.play(env, model_dir, args[0].mode)
            print('CURRENT STEP: {}, EPISODE_NUMBER: {}, EPISODE REWARD: {},'
                  'EPSILON: {}'.format(curr_step, episode_number, episode_reward,
                                       eta))
            episode_run = False
        
        """

        """
        ==========
        TRAINING
        ==========
        """
        if args[0].mode == 'Train':
            # not sure how this step works
            # eta = anneal_exploration(eta, curr_step, args[0].steps / 10.0,
            #                          args[0].start_train, args[0].init_eta,
            #                          args[0].min_eta, 'linear')

            """
            Choose Actions.
            
            We will need an extra step here due to the simulation effect.
            
            if eta > np.random.rand() or curr_step < args[0].start_train:
                action = env.action_space.sample()
            else:
                action = agent.predict_action(state)
            """

            raw_observation, shipyard_simulated_step_memory = halite_agent.get_moves_for_all_shipyards(
                raw_observation=raw_observation,
                step_observation=step_observation,
                episode_number=episode_number,
                step_number=curr_step
            )

            raw_observation, ship_simulated_step_memory = halite_agent.get_moves_for_all_ships(
                raw_observation=raw_observation,
                step_observation=step_observation,
                episode_number=episode_number,
                step_number=curr_step
            )

            actions_for_step = {}

            for id_, action in halite_agent.actions_for_step.items():
                actions_for_step[id_] = action

            """
            Then we take the step.
            """

            next_state, reward, done = halite_env.step(actions=actions_for_step)

            """
            Add to the replay buffer.
            
             next_state, reward, done, info = env.step(action)
            Buffer.add_exp([state, next_state, reward, action, done])
            ready_to_update_model = curr_step > args[0].start_train and len(
                Buffer.buffer) > Buffer.min_size
            
            """

            halite_agent.learn(
                observation=next_state,
                shipyard_simulated_step_memory=shipyard_simulated_step_memory,
                ship_simulated_step_memory=ship_simulated_step_memory,
                episode_number=episode_number,
                step_number=curr_step,
                terminal=done
            )

            if (curr_step % SAVE_STEPS_FACTOR) == 0:
                ship_model_path_full = os.path.join(model_dir, 'ship_agent/')

                try:
                    print(f'Saving model at {ship_model_path_full}, Step: {curr_step} - Episode: {episode_number}')
                    ship_agent.save_weights(
                        model_path=ship_model_path_full
                    )
                    shipyard_agent.save_weights(
                        model_path=os.path.join(model_dir, 'shipyard_agent/')
                    )
                except Exception as e:
                    print('Failed to save weights')
                    print(e)
                    raise e

            scores = [vals[0] for vals in next_state.players]

            episode_results.append(scores)

            opponent = 0 if next_state.player else 1
            player_score = next_state.players[next_state.player][0]
            player_won = player_score > next_state.players[opponent][0]

            if (curr_step % DISPLAY_RESULTS) == 0:
                print(f'Scores {scores}')
                with writer.as_default():
                    tf.summary.scalar(name="SCORE", data=player_score, step=curr_step)
                    tf.summary.scalar(name="OPP_SCORE", data=next_state.players[opponent][0], step=curr_step)
                    writer.flush()

            """
            Update Model if conditions are met
            
            if ready_to_update_model:
                exp_state, exp_next_state, exp_reward, exp_action, exp_done = Buffer.sample_experiences(
                    args[0].batch_size)
                agent.batch_train(exp_state, exp_next_state, exp_reward, exp_action,
                                  exp_done, target_network, args[0].Q_learning)
                if curr_step % args[0].update_target == 0:
                    target_network.set_weights(agent.model.get_weights())
                if curr_step % (SAVE_STEPS_FACTOR *
                                args[0].update_target) == 0 and args[0].save_model:
            """

            """
            Save model if desired
                    models.save_model(
                        agent.model,
                        model_dir + 'model_' + str(episode_number) + '_.h5'
                    )
            """
            state = next_state

            # Resets state
            if done:
                print('Game done')
                with writer.as_default():
                    tf.summary.scalar(name="GAME_RESULT", data=player_won, step=curr_step)
                    writer.flush()
                print('Wrote summary stats')

                with open(os.path.join(model_dir, 'results', f"{episode_number}_results.csv"), 'w') as f:
                    pd.DataFrame(episode_results).to_csv(f)

                episode_number += 1
                if args[0].mode != 'Train':
                    episode_run = True
                    state = env.reset()
                else:
                    state, board = halite_env.reset()
                done = False
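# Hypothetical argument parsing for _run above: the flag names mirror the
# args[0].<name> accesses in the function, but the types and defaults are
# assumptions. parse_known_args() returns a (namespace, unknown) tuple, which is
# why _run indexes args[0].
import argparse


def _parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', default='Train')
    parser.add_argument('--model_dir', default='models/')
    parser.add_argument('--load_model', default='')
    parser.add_argument('--save_model', action='store_true')
    parser.add_argument('--steps', type=int, default=10000)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--discount_factor', type=float, default=0.99)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epsilon', type=float, default=1.0)
    parser.add_argument('--layer_1_dims_ship', type=int, default=128)
    parser.add_argument('--layer_1_dims_shipyard', type=int, default=64)
    return parser.parse_known_args()


if __name__ == '__main__':
    _run(_parse_args())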
Example #7
def play_episode(env: HaliteEnv,
                 ship_agent: Agent,
                 shipyard_agent: Agent,
                 configuration,
                 n_steps: int = 10,
                 verbose: bool = True,
                 training: bool = False,
                 simulated_step_learning: bool = False,
                 episode_number=0):
    episode_rewards = []
    episode_actions = []

    episode_scores = []

    raw_observation: dict = env.reset()[0].__dict__
    print('ep: {}'.format(episode_number))
    done = False

    for step_num in range(n_steps):
        if done:
            board = Board(raw_observation, raw_configuration=configuration)
            print('Done')
            print(board)
            return episode_scores

        actions_for_step = {}

        # won't change during this step's simulated sub-steps
        step_observation = Observation(raw_observation)

        shipyard_temporary_initial_memory = {}
        ship_temporary_initial_memory = {}
        """
        ====================================
        ====================================
        SHIPYARDS
        ====================================
        ====================================
        """
        for shipyard_id, pos in step_observation.players[
                step_observation.player][1].items():
            # will change at each simulated step
            board = Board(raw_observation, raw_configuration=configuration)
            observation = Observation(raw_observation)

            # Select action
            converted_observation, is_occupied = env.wrap_observation_for_shipyard_agent(
                obs=observation,
                player=observation.player,
                spos=pos,
                uid=shipyard_id)
            state_vector = converted_observation.flatten()
            state_vector: np.ndarray = np.append(state_vector, is_occupied)

            action = shipyard_agent.get_action(state_vector,
                                               step=step_num,
                                               game=episode_number)
            halite_action = env.convert_shipyard_action_to_halite_enum(
                action, shipyard_id, observation)
            episode_actions.append(halite_action)

            # re-aligning action and halite action
            # TODO: should refactor
            if halite_action == ShipyardAction.SPAWN:
                action = 1
            else:
                action = 0

            if halite_action:
                actions_for_step[shipyard_id] = halite_action.name
            """
            ============
            Take Action
            ============
            """
            prev_obs = observation
            obs_next = env.update_observation_for_shipyard(
                board, shipyard_id, halite_action)

            reward = env.get_shipyard_reward(
                obs_next,
                env.wrap_observation_for_ship_agent(
                    obs=obs_next,
                    player=obs_next.player,
                    spos=pos,  # because shipyards can't move
                    uid=shipyard_id),
                uid=shipyard_id,
                done=done)

            episode_rewards.append(reward)
            """
            ============
            Update Model
            ============
            """

            converted_next_obs, is_occupied_next = env.wrap_observation_for_shipyard_agent(
                obs_next, obs_next.player, spos=pos, uid=shipyard_id)
            next_state_vector = converted_next_obs.flatten()
            next_state_vector: np.ndarray = np.append(next_state_vector,
                                                      is_occupied_next)

            if training:
                if simulated_step_learning:
                    shipyard_agent.remember(state=state_vector,
                                            action=action,
                                            reward=reward,
                                            new_state=next_state_vector,
                                            done=done)
                    shipyard_agent.learn(step_num=step_num,
                                         episode_num=episode_number)
                else:
                    shipyard_temporary_initial_memory[shipyard_id] = {
                        'state': state_vector,
                        'action': action,
                        'pos': pos,
                        'is_occupied': is_occupied
                    }

            if verbose and ((step_num % 5) == 0):
                print(
                    f"Step {step_num}: Action taken {action} for shipyard {shipyard_id}, "
                    f"reward received {reward}")
            # update current observation with the simulated step ahead
            raw_observation = obs_next
        """
        ====================================
        ====================================
        SHIPS
        ====================================
        ====================================
        """
        for ship_id, (pos, halite) in step_observation.players[
                step_observation.player][2].items():
            # will change at each simulated step
            board = Board(raw_observation, raw_configuration=configuration)
            observation = Observation(raw_observation)
            """
            ============
            Take Action
            ============
            """
            converted_observation = env.wrap_observation_for_ship_agent(
                obs=Observation(board.observation),
                player=board.observation['player'],
                spos=int(pos),
                uid=ship_id)
            state_vector = converted_observation.flatten()
            action = ship_agent.get_action(state_vector,
                                           step=step_num,
                                           game=episode_number)
            episode_actions.append(action)

            halite_action = env.convert_ship_action_to_halite_enum(
                action, observation)

            if halite_action and halite_action.name == halite_action.CONVERT.name and \
                observation.players[observation.player][0] < 500:
                # tried to convert without enough halite
                halite_action = None
                action = 5

            if halite_action:
                actions_for_step[ship_id] = halite_action.name

            # Take action
            prev_obs = observation
            try:
                obs_next: Observation = env.update_observation_for_ship(
                    board, ship_id, halite_action)
            except KeyError as e:
                print('Actions taken')
                print(actions_for_step)
                print('Current board and observation')
                print(board.ships.keys())
                print(observation.players[observation.player])
                print('Initial board and observation')
                print(step_observation.players[step_observation.player])
                raise e

            # The ship may no longer exist (e.g. it collided with an enemy ship
            # or converted to a shipyard). Use the new position if it exists,
            # otherwise fall back to the old one.
            next_pos = obs_next.players[observation.player][2].get(
                ship_id, (None, None))[0]

            if next_pos is None:
                next_pos = int(pos)

            reward = env.get_collector_ship_reward(
                obs_next,
                env.wrap_observation_for_ship_agent(
                    obs=obs_next,
                    player=obs_next.player,
                    spos=pos,  # evaluate the reward around the ship's pre-step position
                    uid=ship_id),
                ship_id,
                done=done)

            episode_rewards.append(reward)
            """
            ============
            Update Model
            ============
            """

            converted_next_obs = env.wrap_observation_for_ship_agent(
                obs=obs_next,
                player=obs_next.player,
                spos=next_pos,
                uid=ship_id)
            next_state_vector = converted_next_obs.flatten()

            # Update model
            if training:
                if simulated_step_learning:
                    ship_agent.remember(state=state_vector,
                                        action=action,
                                        reward=reward,
                                        new_state=next_state_vector,
                                        done=done)
                    ship_agent.learn(step_num=step_num,
                                     episode_num=episode_number)
                else:
                    ship_temporary_initial_memory[ship_id] = {
                        'state': state_vector,
                        'action': action,
                        'pos': pos
                    }
            action_string = halite_action.name if halite_action else 'None'

            if verbose and ((step_num % 5) == 0):
                print(
                    f"Step {step_num}: Action taken {action} | {action_string} for ship {ship_id}, "
                    f"reward received {reward}")
            # update current observation with the simulated step ahead
            raw_observation = obs_next
        """
        ================        
        ================
        == Take Step
        ================
        ================
        """

        # updates the env.observation
        step_results = env.step(actions=actions_for_step)

        print('Actions for step')
        print(actions_for_step)

        observation, game_reward, terminal = step_results

        if not simulated_step_learning:
            """
            Here we are doing learning after the actual "step" has taken place.

            This means that the earlier a ship or shipyard has selected its move, 
            the more unknowns and more "friendly reactions" that can occur afterwards.

            It would probably be very useful to include 
                - remaining_ship_actions
                - remaining_shipyard_actions
                - and potentially the current epsilon value
            as a part of the state.
            """

            player_halite = observation.players[observation.player][0]
            opponent_halites = [
                item[0] for i, item in enumerate(observation.players)
                if i != observation.player
            ]
            best_opponent_halite = max(opponent_halites) if opponent_halites else 0

            for ship_id, val in ship_temporary_initial_memory.items():
                s = val['state']
                a = val['action']
                pos = val['pos']
                converted_next_obs = env.wrap_observation_for_ship_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    spos=int(pos),
                    uid=ship_id)
                ship_reward = env.get_collector_ship_reward(
                    observation=observation,
                    converted_observation=converted_next_obs,
                    uid=ship_id,
                    done=done)
                next_state_vector = converted_next_obs.flatten()
                ship_agent.remember(state=s,
                                    action=a,
                                    reward=ship_reward,
                                    new_state=next_state_vector,
                                    done=done)
                ship_agent.learn(step_num=step_num, episode_num=episode_number)

            for shipyard_id, val in shipyard_temporary_initial_memory.items():
                s = val['state']
                a = val['action']
                pos = val['pos']
                is_occupied = val['is_occupied']
                converted_next_obs, is_occupied_next = env.wrap_observation_for_shipyard_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    spos=int(pos),
                    uid=shipyard_id)
                print('For action: {}'.format(a))
                shipyard_reward = env.get_shipyard_count_reward(
                    observation=observation,
                    converted_observation=converted_next_obs)
                next_state_vector = converted_next_obs.flatten()
                next_state_vector: np.ndarray = np.append(
                    next_state_vector, is_occupied_next)

                shipyard_agent.remember(state=s,
                                        action=a,
                                        reward=shipyard_reward,
                                        new_state=next_state_vector,
                                        done=done)
                shipyard_agent.learn(step_num=step_num,
                                     episode_num=episode_number)

        episode_scores.append([item[0] for item in observation['players']])
        raw_observation = observation

    return episode_scores
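# Hypothetical driver for play_episode above; the constructor arguments mirror
# the _run example earlier, and the hyperparameter values are assumptions.
# (Imports of Agent, HaliteEnv, the state wrappers and do_nothing_agent are
# assumed to come from the project's own modules.)
env = make("halite", {'episodeSteps': 200}, debug=True)

ship_wrapper = ShipStateWrapper(radius=5, max_frames=2, map_size=env.configuration['size'])
shipyard_wrapper = ShipYardStateWrapper(radius=5, max_frames=1, map_size=env.configuration['size'])

halite_env = HaliteEnv(environment=env,
                       opponents=[do_nothing_agent],
                       ship_state_wrapper=ship_wrapper,
                       shipyard_state_wrapper=shipyard_wrapper,
                       ship_reward_type='total_halite')

ship_agent = Agent(alpha=0.001, gamma=0.99, n_actions=6, batch_size=32, epsilon=1.0,
                   input_dims=ship_wrapper.state_size, fc1_dims=128)
shipyard_agent = Agent(alpha=0.001, gamma=0.99, n_actions=2, batch_size=32, epsilon=1.0,
                       input_dims=shipyard_wrapper.state_size, fc1_dims=64)

for episode in range(10):
    scores = play_episode(env=halite_env,
                          ship_agent=ship_agent,
                          shipyard_agent=shipyard_agent,
                          configuration=env.configuration,
                          n_steps=50,
                          training=True,
                          simulated_step_learning=False,
                          episode_number=episode)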
    def play_step(self, raw_observation, step_observation):
        self.actions_for_step = {}
        self.shipyard_step_memory = {}
        self.ship_step_memory = {}

        self.step_number += 1

        raw_observation, shipyard_simulated_step_memory = self.get_moves_for_all_shipyards(
            raw_observation=raw_observation, step_observation=step_observation)

        raw_observation, ship_simulated_step_memory = self.get_moves_for_all_ships(
            raw_observation=raw_observation, step_observation=step_observation)

        env = self.env
        actions_for_step = self.actions_for_step

        # updates the env.observation
        step_results = env.step(actions=actions_for_step)

        observation, game_reward, terminal = step_results

        if self.training:
            """
            Here we are doing learning after the actual "step" has taken place.

            This means that the earlier a ship or shipyard has selected its move, 
            the more unknowns and more "friendly reactions" that can occur afterwards.

            It would probably be very useful to include 
                - remaining_ship_actions
                - remaining_shipyard_actions
                - and potentially the current epsilon value
            as a part of the state.
            """
            for ship_id, val in ship_simulated_step_memory.items():
                s = val['state']
                a = val['action']
                pos = val['pos']

                if self.step_number >= self.ship_frame_stack_len:

                    if self.ship_frame_stack_len > 1:
                        multiframe_state = self.env.get_multiframe_ship_observation(
                            ship_id)
                        converted_obs = np.concatenate(multiframe_state,
                                                       axis=0)
                        s = converted_obs.flatten()

                    converted_next_obs = env.wrap_observation_for_ship_agent(
                        obs=Observation(observation),
                        player=observation['player'],
                        spos=int(pos),
                        uid=ship_id)

                    if self.ship_frame_stack_len > 1:
                        multiframe_state = self.env.get_multiframe_ship_observation(
                            ship_id)
                        converted_next_obs = np.concatenate(multiframe_state,
                                                            axis=0)

                    ship_reward = env.get_ship_reward(
                        observation=observation,
                        converted_observation=converted_next_obs,
                        uid=ship_id,
                        done=terminal)

                    next_state_vector = converted_next_obs.flatten()
                    self.ship_agent.remember(state=s,
                                             action=a,
                                             reward=ship_reward,
                                             new_state=next_state_vector,
                                             done=terminal)
                    self.ship_agent.learn(step_num=self.step_number,
                                          episode_num=self.episode_number)

            for shipyard_id, val in shipyard_simulated_step_memory.items():
                s = val['state']
                a = val['action']
                pos = val['pos']
                is_occupied = val['is_occupied']
                converted_next_obs, is_occupied_next = env.wrap_observation_for_shipyard_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    spos=int(pos),
                    uid=shipyard_id)
                shipyard_reward = env.get_shipyard_reward(
                    observation=observation,
                    converted_observation=converted_next_obs,
                    uid=shipyard_id,
                    done=terminal)
                next_state_vector = converted_next_obs.flatten()
                next_state_vector: np.ndarray = np.append(
                    next_state_vector, is_occupied_next)

                self.shipyard_agent.remember(state=s,
                                             action=a,
                                             reward=shipyard_reward,
                                             new_state=next_state_vector,
                                             done=terminal)
                self.shipyard_agent.learn(step_num=self.step_number,
                                          episode_num=self.episode_number)

        return terminal, [item[0] for item in observation.players]
    def get_single_ship_move(self, ship_id, pos, step_observation,
                             raw_observation, ship_simulated_step_memory):
        done = False

        board = Board(raw_observation, raw_configuration=self.configuration)
        observation = Observation(raw_observation)
        """
        ============
        Take Action
        ============
        """
        converted_observation = self.env.wrap_observation_for_ship_agent(
            obs=Observation(board.observation),
            player=board.observation['player'],
            spos=int(pos),
            uid=ship_id)
        state_vector = converted_observation.flatten()

        if self.ship_frame_stack_len > 1:
            multiframe_state = self.env.get_multiframe_ship_observation(
                ship_id)
            converted_obs = np.concatenate(multiframe_state, axis=0)
            state_vector = converted_obs.flatten()
        if len(self.env.get_multiframe_ship_observation(
                ship_id)) == self.ship_frame_stack_len:
            action = self.ship_agent.get_action(state_vector,
                                                step=self.step_number,
                                                game=self.episode_number)
        else:
            action = np.random.randint(0, 6)

        self.episode_actions.append(action)

        halite_action = self.env.convert_ship_action_to_halite_enum(
            action, observation)

        if halite_action and halite_action.name == halite_action.CONVERT.name and \
                observation.players[observation.player][0] < 500:
            # tried to convert without enough halite
            halite_action = None
            action = 5

        if halite_action:
            self.actions_for_step[ship_id] = halite_action.name

        # Take action
        try:
            obs_next: Observation = self.env.update_observation_for_ship(
                board, ship_id, halite_action)
        except KeyError as e:
            print('Actions taken')
            print(self.actions_for_step)
            print('Initial board and observation')
            print(step_observation.players[step_observation.player])
            raise e

        # The ship may no longer exist (e.g. it collided with an enemy ship or
        # converted to a shipyard); this version keeps the pre-step position.
        # next_pos = obs_next.players[observation.player][2].get(ship_id, (None, None))[0]
        """
        ============
        Prepare for Model Update
        ============
        """

        # Update model
        if self.training:
            ship_simulated_step_memory[ship_id] = {
                'state': state_vector,
                'action': action,
                'pos': pos
            }
        action_string = halite_action.name if halite_action else 'None'

        if self.verbose and ((self.step_number % 10) == 0):
            print(
                f"Step {self.step_number}: Action taken {action} | {action_string} for ship {ship_id}, "
                f"reward received N/A | Player state {obs_next.players[observation.player]}"
            )
        # update current observation with the simulated step ahead
        raw_observation = obs_next

        return raw_observation
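# A sketch (an assumption, not the repository's actual implementation) of how a
# per-ship frame buffer behind get_multiframe_ship_observation could work:
# keep a deque of the last `max_frames` wrapped observations per ship and
# hand back the stacked frames, oldest first, once the buffer is full.
from collections import defaultdict, deque


class FrameStack:
    def __init__(self, max_frames: int):
        self.max_frames = max_frames
        self.frames = defaultdict(lambda: deque(maxlen=max_frames))

    def push(self, ship_id, converted_observation):
        self.frames[ship_id].append(converted_observation)

    def get(self, ship_id):
        # returns a list of up to max_frames frames, oldest first
        return list(self.frames[ship_id])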
Example #10
    def get_single_shipyard_move(self, shipyard_id, pos, step_observation,
                                 raw_observation,
                                 shipyard_temporary_initial_memory):
        configuration = self.configuration
        board = Board(raw_observation, raw_configuration=configuration)
        observation = Observation(raw_observation)

        verbose = self.verbose
        done = False

        # Select action
        converted_observation, is_occupied = self.env.wrap_observation_for_shipyard_agent(
            obs=observation,
            player=observation.player,
            spos=pos,
            uid=shipyard_id)
        state_vector = converted_observation.flatten()
        state_vector: np.ndarray = np.append(state_vector, is_occupied)

        action = self.shipyard_agent.get_action(state_vector,
                                                step=self.step_number,
                                                game=self.episode_number)
        halite_action = self.env.convert_shipyard_action_to_halite_enum(
            action, shipyard_id, observation)
        self.episode_actions.append(halite_action)
        """
        ============
        Take Action
        ============
        """
        obs_next = self.env.update_observation_for_shipyard(
            board, shipyard_id, halite_action)

        reward = self.env.get_shipyard_reward(
            obs_next,
            self.env.wrap_observation_for_ship_agent(
                obs=obs_next,
                player=obs_next.player,
                spos=pos,  # because shipyards can't move
                uid=shipyard_id),
            uid=shipyard_id,
            done=done)

        self.episode_rewards.append(reward)
        """
        ============
        Prepare for Update Model
        ============
        """

        if self.training:
            shipyard_temporary_initial_memory[shipyard_id] = {
                'state': state_vector,
                'action': action,
                'pos': pos,
                'is_occupied': is_occupied
            }

        if verbose and ((self.step_number % 10) == 0):
            print(
                f"Step {self.step_number}: Action taken {action} for shipyard {shipyard_id}, "
                f"reward received {reward}")
        # update current observation with the simulated step ahead
        raw_observation = obs_next

        return raw_observation
    def learn(self, observation, game_reward, terminal,
              ship_simulated_step_memory, shipyard_simulated_step_memory,
              step_number, episode_number):
        env = self.env
        """
        Here we are doing learning after the actual "step" has taken place.

        This means that the earlier a ship or shipyard has selected its move, 
        the more unknowns and more "friendly reactions" that can occur afterwards.

        It would probably be very useful to include 
            - remaining_ship_actions
            - remaining_shipyard_actions
            - and potentially the current epsilon value
        as a part of the state.
        """
        remaining = len(ship_simulated_step_memory)

        for ship_id, val in ship_simulated_step_memory.items():
            s = val['state']
            a = val['action']
            pos = val['pos']

            if step_number >= self.ship_frame_stack_len:

                if self.ship_frame_stack_len > 1:
                    multiframe_state = self.env.get_multiframe_ship_observation(
                        ship_id)
                    converted_obs = np.concatenate(multiframe_state, axis=0)
                    s = converted_obs.flatten()

                converted_next_obs = env.wrap_observation_for_ship_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    remaining=remaining,
                    turn=step_number,
                    spos=int(pos),
                    uid=ship_id)

                remaining -= 1

                if self.ship_frame_stack_len > 1:
                    multiframe_state = self.env.get_multiframe_ship_observation(
                        ship_id)
                    converted_next_obs = np.concatenate(multiframe_state,
                                                        axis=0)

                ship_reward = env.get_ship_reward(
                    observation=observation,
                    converted_observation=converted_next_obs,
                    uid=ship_id,
                    done=terminal)

                if len(converted_next_obs) >= self.ship_frame_stack_len:
                    next_state_vector = converted_next_obs.flatten()
                    try:
                        self.ship_agent.remember(state=s,
                                                 action=a,
                                                 reward=ship_reward,
                                                 new_state=next_state_vector,
                                                 done=terminal)
                        self.ship_agent.learn(step_num=step_number,
                                              episode_num=episode_number)
                    except Exception as e:
                        print('shapes')
                        print(s.shape)
                        print(next_state_vector.shape)
                        raise e

        for shipyard_id, val in shipyard_simulated_step_memory.items():
            s = val['state']
            a = val['action']
            pos = val['pos']
            converted_next_obs = env.wrap_observation_for_shipyard_agent(
                obs=Observation(observation),
                player=observation['player'],
                spos=int(pos),
                uid=shipyard_id)
            shipyard_reward = env.get_shipyard_reward(
                observation=observation,
                converted_observation=converted_next_obs,
                uid=shipyard_id,
                done=terminal)
            next_state_vector = converted_next_obs.flatten()

            self.shipyard_agent.remember(state=s,
                                         action=a,
                                         reward=shipyard_reward,
                                         new_state=next_state_vector,
                                         done=terminal)
            self.shipyard_agent.learn(step_num=step_number,
                                      episode_num=episode_number)

        self.reset_step_actions()
        return terminal, [item[0] for item in observation.players]
    def get_single_shipyard_move(
        self,
        shipyard_id,
        pos,
        step_observation,
        raw_observation,
        shipyard_temporary_initial_memory,
        step_number=0,
        episode_number=0,
    ):
        configuration = self.configuration
        board = Board(raw_observation, raw_configuration=configuration)
        observation = Observation(raw_observation)

        verbose = self.verbose
        done = False

        # Select action
        converted_observation = self.shipyard_state_wrapper.get_basic_single_frame_complete_observation(
            obs=observation,
            player=observation.player,
            sy_pos=pos,
            uid=shipyard_id)
        state_vector = converted_observation

        player_state = step_observation.players[step_observation.player]

        if len(player_state[2]) == 0 and player_state[0] > 500:
            # no ships left but enough halite banked: force a SPAWN
            action = 1
        else:
            action = self.shipyard_agent.get_action(state_vector,
                                                    step=step_number,
                                                    game=episode_number)
        halite_action = self.shipyard_state_wrapper.convert_action_to_enum(
            shipyard_id, observation, action)

        if halite_action:
            self.actions_for_step[shipyard_id] = halite_action.name
        """
        ============
        Take Action
        ============
        """
        obs_next = self.env.update_observation_for_shipyard(
            board, shipyard_id, halite_action)

        reward = self.env.get_shipyard_reward(
            obs_next,
            self.env.wrap_observation_for_shipyard_agent(
                obs=obs_next,
                player=obs_next.player,
                spos=pos,  # because shipyards can't move
                uid=shipyard_id),
            uid=shipyard_id,
            done=done)
        """
        ============
        Prepare for Update Model
        ============
        """

        is_occupied = state_vector[-2]

        if self.training:
            shipyard_temporary_initial_memory[shipyard_id] = {
                'state': state_vector,
                'action': action,
                'pos': pos,
                'is_occupied': is_occupied
            }

        if verbose and ((step_number % 10) == 0):
            print(
                f"Step {step_number}: Action taken {action} for shipyard {shipyard_id}, "
                f"reward received {reward}")
        # update current observation with the simulated step ahead
        raw_observation = obs_next

        return raw_observation
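# The Agent class used throughout these snippets is not shown here; this is a
# minimal epsilon-greedy sketch of the interface they rely on
# (get_action / remember / learn). The network, replay buffer and epsilon
# schedule below are illustrative assumptions only.
import numpy as np


class EpsilonGreedyAgentSketch:
    def __init__(self, n_actions, epsilon=1.0, epsilon_min=0.05, epsilon_decay=0.995):
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory = []  # stand-in for a replay buffer

    def get_action(self, state_vector, step=0, game=0):
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        q_values = self._predict(state_vector)
        return int(np.argmax(q_values))

    def remember(self, state, action, reward, new_state, done):
        self.memory.append((state, action, reward, new_state, done))

    def learn(self, step_num=0, episode_num=0):
        # a real agent would sample a batch from self.memory and fit its network here
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def _predict(self, state_vector):
        # placeholder: a real agent would run its Q-network forward pass here
        return np.zeros(self.n_actions)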
Example #13
 def setUp(self):
     obs = Observation(test_state['observation'])
     self.obs = obs
     self.ship_state_wrapper = ShipStateWrapper(radius=2,
                                                max_frames=2,
                                                map_size=8)
Example #14
from kaggle_environments import make
from kaggle_environments.envs.halite.helpers import Board, ShipAction, ShipyardAction, Observation


env = make("halite", configuration={"episodeSteps": 10, "size": 8})


env.run(["random", "random"])


observation = Observation(env.state[0]['observation'])


print(env.configuration)

print('HALITE')
print(observation.halite)

print('PLAYERS')
print(observation.players)
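# A small follow-on sketch: the same final state can also be wrapped in the
# Board helper (already imported above) for a readable view of ships and shipyards.
board = Board(env.state[0]['observation'], env.configuration)
print(board)  # ASCII rendering of the final board

for ship in board.current_player.ships:
    print(ship.id, ship.position, ship.halite)
for shipyard in board.current_player.shipyards:
    print(shipyard.id, shipyard.position)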
Example #15
 def step(self, actions) -> Tuple[Observation, float, bool]:
     """Step forward in actual environment"""
     self.observation, reward, terminal, info = self.trainer.step(actions)
     self.observation = Observation(self.observation)
     terminal = bool(terminal)
     return self.observation, reward, terminal
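# Sketch of how the trainer used above is typically obtained from
# kaggle_environments (an assumption about this HaliteEnv's setup, not its
# actual constructor): env.train() returns an object exposing reset()/step().
from kaggle_environments import make

env = make("halite", configuration={"episodeSteps": 400, "size": 21})
trainer = env.train([None, "random"])  # None marks the seat controlled by our agent

observation = Observation(trainer.reset())
done = False
while not done:
    actions = {}  # {unit_id: action_name} built by the agent for this turn
    observation, reward, done, info = trainer.step(actions)
    observation = Observation(observation)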