def update_observation_for_shipyard(board: Board, uid, action):
    """Simulate an environment step forward and return the updated observation.

    https://www.kaggle.com/sam/halite-sdk-overview#Simulating-Actions-(Lookahead)
    """
    shipyard = board.shipyards[uid]
    shipyard.next_action = action
    ret_val = board.next()
    return Observation(ret_val.observation)
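# Hedged usage sketch for the lookahead helper above: simulate a SPAWN for one of
# the current player's shipyards and inspect the resulting observation. Assumes a
# raw observation dict and a configuration are already in scope; names below come
# from the kaggle_environments halite SDK.
from kaggle_environments.envs.halite.helpers import Board, Observation, ShipyardAction

def demo_shipyard_lookahead(raw_observation: dict, configuration) -> Observation:
    board = Board(raw_observation, raw_configuration=configuration)
    # Pick any shipyard owned by the current player (assumes at least one exists).
    shipyard_id = board.current_player.shipyard_ids[0]
    return update_observation_for_shipyard(board, shipyard_id, ShipyardAction.SPAWN)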
def __call__(self, observation: Dict[str, Any], configuration: Dict[str, Any]) -> Dict[Any, str]:
    raw_observation = observation
    step_observation = Observation(observation)
    raw_observation, shipyard_simulated_step_memory = self.get_moves_for_all_shipyards(
        raw_observation=raw_observation,
        step_observation=step_observation,
        episode_number=0,
        step_number=0)
    self.get_moves_for_all_ships(
        raw_observation=raw_observation,
        step_observation=step_observation,
        episode_number=0,
        step_number=0)
    actions_for_step = self.actions_for_step
    return actions_for_step
def play_episode(self, max_steps):
    raw_observation: dict = self.env.reset()[0].__dict__
    raw_observation['players'][raw_observation['player']][0] = (
        raw_observation['players'][raw_observation['player']][0] - self.handicap)
    step_observation = Observation(raw_observation)
    episode_score = []
    print("Episode start")
    print(raw_observation['players'])
    for step in range(max_steps):
        self.step_number = step
        step_scores = self.play_step(raw_observation=raw_observation,
                                     step_observation=step_observation)
        episode_score.append(step_scores)
    self.episode_number += 1
    return episode_score
def halite_run_agent(observation, configuration):
    raw_observation = observation
    step_observation = Observation(observation)
    raw_observation, shipyard_simulated_step_memory = halite_agent.get_moves_for_all_shipyards(
        raw_observation=raw_observation,
        step_observation=step_observation,
        episode_number=self.episode_number,
        step_number=self.step_number)
    raw_observation, ship_simulated_step_memory = halite_agent.get_moves_for_all_ships(
        raw_observation=raw_observation,
        step_observation=step_observation,
        episode_number=self.episode_number,
        step_number=self.step_number)
    actions_for_step = halite_agent.actions_for_step
    return actions_for_step
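# Hedged usage sketch: because halite_run_agent has the standard
# (observation, configuration) signature, it can be handed straight to the
# kaggle_environments runner. Assumes `halite_agent` has already been built
# (see the setup inside _run below).
from kaggle_environments import make

def run_one_game():
    env = make("halite", configuration={"episodeSteps": 200})
    # The runner calls halite_run_agent once per turn for player 0.
    return env.run([halite_run_agent, "random"])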
def play_episode(self, max_steps):
    raw_observation: dict = self.env.reset()[0].__dict__
    raw_observation['players'][raw_observation['player']][0] = (
        raw_observation['players'][raw_observation['player']][0] - self.handicap)
    episode_scores = []
    if self.episode_number % 5 == 0:
        print("Episode {}".format(self.episode_number))
        print(raw_observation['players'])
    for step in range(max_steps):
        self.step_number = step
        step_observation = Observation(raw_observation)
        raw_observation, done, step_scores = self.play_step(
            raw_observation=raw_observation,
            step_observation=step_observation)
        episode_scores.append(step_scores)
        if done:
            # stop early on terminal, but still fall through so the
            # episode counter below is incremented
            break
    self.episode_number = 1 + self.episode_number
    return episode_scores
def _run(args):
    print('Running')
    env = make("halite", {'episodeSteps': 200}, debug=True)

    """ Setup all of the hyperparameters and other inputs """
    try:
        learn_rate = args[0].learning_rate
        discount_factor = args[0].discount_factor
        batch_size = args[0].batch_size
        layer_1_dims_ship = args[0].layer_1_dims_ship
        layer_1_dims_shipyard = args[0].layer_1_dims_shipyard
        epsilon = args[0].epsilon
        if args[0].mode == 'Train':
            # Add timestamp; important for HP tuning so models don't clobber each
            # other.
            model_dir = hp_directory(args[0].model_dir)
        else:
            model_dir = args[0].model_dir
        writer = tf.summary.create_file_writer(os.path.join(model_dir, 'metrics'))
    except Exception as e:
        print('Failure parsing args')
        print(e)
        raise e

    try:
        """ Extra helpers needed for halite """
        ship_state_wrapper = ShipStateWrapper(
            radius=5,
            max_frames=2,
            map_size=env.configuration['size']
        )
        shipyard_state_wrapper = ShipYardStateWrapper(
            radius=5,
            max_frames=1,
            map_size=env.configuration['size']
        )
        ship_agent = Agent(
            alpha=learn_rate,
            gamma=discount_factor,
            n_actions=6,
            batch_size=batch_size,
            epsilon=epsilon,
            input_dims=ship_state_wrapper.state_size,
            fc1_dims=layer_1_dims_ship
        )
        shipyard_agent = Agent(
            alpha=learn_rate,
            gamma=discount_factor,
            n_actions=2,
            batch_size=batch_size,
            epsilon=epsilon,
            input_dims=shipyard_state_wrapper.state_size,
            fc1_dims=layer_1_dims_shipyard
        )
        halite_env = HaliteEnv(
            environment=env,
            opponents=[do_nothing_agent],
            ship_state_wrapper=ship_state_wrapper,
            shipyard_state_wrapper=shipyard_state_wrapper,
            ship_reward_type='total_halite'
        )
        halite_agent = HaliteAgent(
            ship_agent=ship_agent,
            shipyard_agent=shipyard_agent,
            configuration=env.configuration,
            halite_env=halite_env,
            ship_state_wrapper=ship_state_wrapper,
            shipyard_state_wrapper=shipyard_state_wrapper
        )
    except Exception as e:
        print('Failed setting up Halite Utilities')
        print(e)
        raise e

    print("STARTING...")

    """ Loading previous model if possible """
    if not os.path.exists(model_dir) and args[0].save_model:
        os.makedirs(model_dir)
    if not os.path.exists(os.path.join(model_dir, 'results')) and args[0].save_model:
        os.makedirs(os.path.join(model_dir, 'results'))
    print("MODEL WILL BE STORED AT: ", model_dir)

    if args[0].mode != 'Train':
        trained_model_path = args[0].load_model
        try:
            ship_agent.load_weights(trained_model_path)
        except:
            print('{} is not a valid .h5 model.'.format(trained_model_path))

    episode_reward, episode_number, done = 0, 0, False
    if done or args[0].mode != 'Train':
        state = env.reset()
    else:
        state, board = halite_env.reset()

    print('Beginning training loop')
    episode_results = []
    for curr_step in range(args[0].steps):
        raw_observation = state
        step_observation = Observation(raw_observation)

        """
        If we are not in training mode
        episode_reward = agent.play(env, model_dir, args[0].mode)
        print('CURRENT STEP: {}, EPISODE_NUMBER: {}, EPISODE REWARD: {},'
              'EPSILON: {}'.format(curr_step, episode_number, episode_reward, eta))
        episode_run = False
        """

        """ ========== TRAINING ========== """
        if args[0].mode == 'Train':
            # not sure how this step works
            # eta = anneal_exploration(eta, curr_step, args[0].steps / 10.0,
            #                          args[0].start_train, args[0].init_eta,
            #                          args[0].min_eta, 'linear')

            """
            Choose Actions. We will need an extra step here due to the simulation effect.
            if eta > np.random.rand() or curr_step < args[0].start_train:
                action = env.action_space.sample()
            else:
                action = agent.predict_action(state)
            """
            raw_observation, shipyard_simulated_step_memory = halite_agent.get_moves_for_all_shipyards(
                raw_observation=raw_observation,
                step_observation=step_observation,
                episode_number=episode_number,
                step_number=curr_step
            )
            raw_observation, ship_simulated_step_memory = halite_agent.get_moves_for_all_ships(
                raw_observation=raw_observation,
                step_observation=step_observation,
                episode_number=episode_number,
                step_number=curr_step
            )
            actions_for_step = {}
            for id_, action in halite_agent.actions_for_step.items():
                actions_for_step[id_] = action

            """ Then we take the step. """
            next_state, reward, done = halite_env.step(actions=actions_for_step)

            """
            Add to the replay buffer.
            next_state, reward, done, info = env.step(action)
            Buffer.add_exp([state, next_state, reward, action, done])
            ready_to_update_model = curr_step > args[0].start_train and len(
                Buffer.buffer) > Buffer.min_size
            """
            halite_agent.learn(
                observation=next_state,
                shipyard_simulated_step_memory=shipyard_simulated_step_memory,
                ship_simulated_step_memory=ship_simulated_step_memory,
                episode_number=episode_number,
                step_number=curr_step,
                terminal=done
            )

            if (curr_step % SAVE_STEPS_FACTOR) == 0:
                ship_model_path_full = os.path.join(model_dir, 'ship_agent/')
                try:
                    print(f'Saving model at {ship_model_path_full}, Step: {curr_step} - Episode: {episode_number}')
                    ship_agent.save_weights(
                        model_path=ship_model_path_full
                    )
                    shipyard_agent.save_weights(
                        model_path=os.path.join(model_dir, 'shipyard_agent/')
                    )
                except Exception as e:
                    print('Failed to save weights')
                    print(e)
                    raise e

            scores = [vals[0] for vals in next_state.players]
            episode_results.append(scores)

            opponent = 0 if next_state.player else 1
            player_score = next_state.players[next_state.player][0]
            player_won = player_score > next_state.players[opponent][0]

            if (curr_step % DISPLAY_RESULTS) == 0:
                print(f'Scores {scores}')
                with writer.as_default():
                    tf.summary.scalar(name="SCORE", data=player_score, step=curr_step)
                    tf.summary.scalar(name="OPP_SCORE", data=next_state.players[opponent][0], step=curr_step)
                    writer.flush()

            """
            Update Model if conditions are met
            if ready_to_update_model:
                exp_state, exp_next_state, exp_reward, exp_action, exp_done = Buffer.sample_experiences(
                    args[0].batch_size)
                agent.batch_train(exp_state, exp_next_state, exp_reward, exp_action,
                                  exp_done, target_network, args[0].Q_learning)
                if curr_step % args[0].update_target == 0:
                    target_network.set_weights(agent.model.get_weights())
                if curr_step % (SAVE_STEPS_FACTOR * args[0].update_target) == 0 and args[0].save_model:
            """
            """
            Save model if desired
                    models.save_model(
                        agent.model,
                        model_dir + 'model_' + str(episode_number) + '_.h5'
                    )
            """
            state = next_state  # Resets state

            if done:
                print('Game done')
                with writer.as_default():
                    tf.summary.scalar(name="GAME_RESULT", data=player_won, step=curr_step)
                    writer.flush()
                    print('Wrote summary stats')
                with open(os.path.join(model_dir, 'results', f"{episode_number}_results.csv"), 'w') as f:
                    pd.DataFrame(episode_results).to_csv(f)
                    f.close()
                episode_number += 1
                if args[0].mode != 'Train':
                    episode_run = True
                    state = env.reset()
                else:
                    state, board = halite_env.reset()
                done = False
def play_episode(env: HaliteEnv,
                 ship_agent: Agent,
                 shipyard_agent: Agent,
                 configuration,
                 n_steps: int = 10,
                 verbose: bool = True,
                 training: bool = False,
                 simulated_step_learning: bool = False,
                 episode_number=0):
    episode_rewards = []
    episode_actions = []
    episode_scores = []
    raw_observation: dict = env.reset()[0].__dict__
    print('ep: {}'.format(episode_number))
    done = False

    for step_num in range(n_steps):
        if done:
            board = Board(raw_observation, raw_configuration=configuration)
            print('Done')
            print(board)
            return episode_scores

        actions_for_step = {}
        # won't change within this step
        step_observation = Observation(raw_observation)
        shipyard_temporary_initial_memory = {}
        ship_temporary_initial_memory = {}

        """
        ====================================
        ====================================
        SHIPYARDS
        ====================================
        ====================================
        """
        for shipyard_id, pos in step_observation.players[
                step_observation.player][1].items():
            # will change at each simulated step
            board = Board(raw_observation, raw_configuration=configuration)
            observation = Observation(raw_observation)

            # Select action
            converted_observation, is_occupied = env.wrap_observation_for_shipyard_agent(
                obs=observation, player=observation.player, spos=pos, uid=shipyard_id)
            state_vector = converted_observation.flatten()
            state_vector: np.ndarray = np.append(state_vector, is_occupied)
            action = shipyard_agent.get_action(state_vector, step=step_num, game=episode_number)
            halite_action = env.convert_shipyard_action_to_halite_enum(
                action, shipyard_id, observation)
            episode_actions.append(halite_action)

            # re-aligning action and halite action
            # TODO: should refactor
            if halite_action == ShipyardAction.SPAWN:
                action = 1
            else:
                action = 0

            if halite_action:
                actions_for_step[shipyard_id] = halite_action.name

            """
            ============
            Take Action
            ============
            """
            prev_obs = observation
            obs_next = env.update_observation_for_shipyard(
                board, shipyard_id, halite_action)
            reward = env.get_shipyard_reward(
                obs_next,
                env.wrap_observation_for_ship_agent(
                    obs=obs_next,
                    player=obs_next.player,
                    spos=pos,  # because shipyards can't move
                    uid=shipyard_id),
                uid=shipyard_id,
                done=done)
            episode_rewards.append(reward)

            """
            ============
            Update Model
            ============
            """
            converted_next_obs, is_occupied_next = env.wrap_observation_for_shipyard_agent(
                obs_next, obs_next.player, spos=pos, uid=shipyard_id)
            next_state_vector = converted_next_obs.flatten()
            next_state_vector: np.ndarray = np.append(next_state_vector, is_occupied_next)

            if training:
                if simulated_step_learning:
                    shipyard_agent.remember(state=state_vector,
                                            action=action,
                                            reward=reward,
                                            new_state=next_state_vector,
                                            done=done)
                    shipyard_agent.learn(step_num=step_num, episode_num=episode_number)
                else:
                    shipyard_temporary_initial_memory[shipyard_id] = {
                        'state': state_vector,
                        'action': action,
                        'pos': pos,
                        'is_occupied': is_occupied
                    }

            if verbose and ((n_steps % 5) == 0):
                print(
                    f"Step {step_num}: Action taken {action} for shipyard {shipyard_id}, "
                    f"reward received {reward}")

            # update current observation with the simulated step ahead
            raw_observation = obs_next

        """
        ====================================
        ====================================
        SHIPS
        ====================================
        ====================================
        """
        for ship_id, (pos, halite) in step_observation.players[
                step_observation.player][2].items():
            # will change at each simulated step
            board = Board(raw_observation, raw_configuration=configuration)
            observation = Observation(raw_observation)

            """
            ============
            Take Action
            ============
            """
            converted_observation = env.wrap_observation_for_ship_agent(
                obs=Observation(board.observation),
                player=board.observation['player'],
                spos=int(pos),
                uid=ship_id)
            state_vector = converted_observation.flatten()
            action = ship_agent.get_action(state_vector, step=step_num, game=episode_number)
            episode_actions.append(action)
            halite_action = env.convert_ship_action_to_halite_enum(action, observation)
            if halite_action and halite_action.name == halite_action.CONVERT.name and \
                    observation.players[observation.player][0] < 500:
                # tried to convert without enough halite
                halite_action = None
                action = 5
            if halite_action:
                actions_for_step[ship_id] = halite_action.name

            # Take action
            prev_obs = observation
            try:
                obs_next: Observation = env.update_observation_for_ship(
                    board, ship_id, halite_action)
            except KeyError as e:
                print('Actions taken')
                print(actions_for_step)
                print('Current board and observation')
                print(board.ships.keys())
                print(observation.players[observation.player])
                print('Initial board and observation')
                print(step_observation.players[step_observation.player])
                raise e

            # the ship may no longer exist...
            # ie it collided with an enemy ship or converted to a shipyard
            # for now we will use the new position IF it exists, otherwise just use the old one
            next_pos = obs_next.players[observation.player][2].get(
                ship_id, (None, None))[0]
            if not next_pos:
                next_pos = int(pos)

            reward = env.get_collector_ship_reward(
                obs_next,
                env.wrap_observation_for_ship_agent(
                    obs=obs_next,
                    player=obs_next.player,
                    spos=pos,  # pre-step position (next_pos is used for the next-state observation below)
                    uid=ship_id),
                ship_id,
                done=done)
            episode_rewards.append(reward)

            """
            ============
            Update Model
            ============
            """
            converted_next_obs = env.wrap_observation_for_ship_agent(
                obs=obs_next, player=obs_next.player, spos=next_pos, uid=ship_id)
            next_state_vector = converted_next_obs.flatten()

            # Update model
            if training:
                if simulated_step_learning:
                    ship_agent.remember(state=state_vector,
                                        action=action,
                                        reward=reward,
                                        new_state=next_state_vector,
                                        done=done)
                    ship_agent.learn(step_num=step_num, episode_num=episode_number)
                else:
                    ship_temporary_initial_memory[ship_id] = {
                        'state': state_vector,
                        'action': action,
                        'pos': pos
                    }

            action_string = halite_action.name if halite_action else 'None'
            if verbose and ((n_steps % 5) == 0):
                print(
                    f"Step {step_num}: Action taken {action} | {action_string} for ship {ship_id}, "
                    f"reward received {reward}")

            # update current observation with the simulated step ahead
            raw_observation = obs_next

        """
        ================
        ================
        == Take Step
        ================
        ================
        """
        # updates the env.observation
        step_results = env.step(actions=actions_for_step)
        print('Actions for step')
        print(actions_for_step)
        observation, game_reward, terminal = step_results
        # propagate the real-step terminal flag so the next loop iteration can exit
        # and post-step learning sees the correct done value
        done = terminal

        if not simulated_step_learning:
            """
            Here we are doing learning after the actual "step" has taken place.
            This means that the earlier a ship or shipyard has selected its move,
            the more unknowns and more "friendly reactions" that can occur afterwards.

            It would probably be very useful to include
            - remaining_ship_actions
            - remaining_shipyard_actions
            - and potentially the current epsilon value
            as a part of the state.
            """
            player_halite = observation.players[observation.player][0]
            opponent_halites = [
                item[0] for item in observation.players[observation.player:]
            ]
            best_opponent_halite = sorted(opponent_halites, reverse=True)[0]

            for ship_id, val in ship_temporary_initial_memory.items():
                s = val['state']
                a = val['action']
                pos = val['pos']
                converted_next_obs = env.wrap_observation_for_ship_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    spos=int(pos),
                    uid=ship_id)
                ship_reward = env.get_collector_ship_reward(
                    observation=observation,
                    converted_observation=converted_next_obs,
                    uid=ship_id,
                    done=done)
                next_state_vector = converted_next_obs.flatten()
                ship_agent.remember(state=s,
                                    action=a,
                                    reward=ship_reward,
                                    new_state=next_state_vector,
                                    done=done)
                ship_agent.learn(step_num=step_num, episode_num=episode_number)

            for shipyard_id, val in shipyard_temporary_initial_memory.items():
                s = val['state']
                a = val['action']
                pos = val['pos']
                is_occupied = val['is_occupied']
                converted_next_obs, is_occupied_next = env.wrap_observation_for_shipyard_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    spos=int(pos),
                    uid=shipyard_id)
                print('For action: {}'.format(a))
                shipyard_reward = env.get_shipyard_count_reward(
                    observation=observation,
                    converted_observation=converted_next_obs)
                next_state_vector = converted_next_obs.flatten()
                next_state_vector: np.ndarray = np.append(
                    next_state_vector, is_occupied_next)
                shipyard_agent.remember(state=s,
                                        action=a,
                                        reward=shipyard_reward,
                                        new_state=next_state_vector,
                                        done=done)
                shipyard_agent.learn(step_num=step_num, episode_num=episode_number)

        episode_scores.append([item[0] for item in observation['players']])
        raw_observation = observation

    return episode_scores
def play_step(self, raw_observation, step_observation):
    self.actions_for_step = {}
    self.shipyard_step_memory = {}
    self.ship_step_memory = {}
    self.step_number += 1

    raw_observation, shipyard_simulated_step_memory = self.get_moves_for_all_shipyards(
        raw_observation=raw_observation, step_observation=step_observation)
    raw_observation, ship_simulated_step_memory = self.get_moves_for_all_ships(
        raw_observation=raw_observation, step_observation=step_observation)

    env = self.env
    actions_for_step = self.actions_for_step

    # updates the env.observation
    step_results = env.step(actions=actions_for_step)
    observation, game_reward, terminal = step_results

    if self.training:
        """
        Here we are doing learning after the actual "step" has taken place.
        This means that the earlier a ship or shipyard has selected its move,
        the more unknowns and more "friendly reactions" that can occur afterwards.

        It would probably be very useful to include
        - remaining_ship_actions
        - remaining_shipyard_actions
        - and potentially the current epsilon value
        as a part of the state.
        """
        for ship_id, val in ship_simulated_step_memory.items():
            s = val['state']
            a = val['action']
            pos = val['pos']
            if self.step_number >= self.ship_frame_stack_len:
                if self.ship_frame_stack_len > 1:
                    multiframe_state = self.env.get_multiframe_ship_observation(ship_id)
                    converted_obs = np.concatenate(multiframe_state, axis=0)
                    s = converted_obs.flatten()
                converted_next_obs = env.wrap_observation_for_ship_agent(
                    obs=Observation(observation),
                    player=observation['player'],
                    spos=int(pos),
                    uid=ship_id)
                if self.ship_frame_stack_len > 1:
                    multiframe_state = self.env.get_multiframe_ship_observation(ship_id)
                    converted_next_obs = np.concatenate(multiframe_state, axis=0)
                ship_reward = env.get_ship_reward(
                    observation=observation,
                    converted_observation=converted_next_obs,
                    uid=ship_id,
                    done=terminal)
                next_state_vector = converted_next_obs.flatten()
                self.ship_agent.remember(state=s,
                                         action=a,
                                         reward=ship_reward,
                                         new_state=next_state_vector,
                                         done=terminal)
                self.ship_agent.learn(step_num=self.step_number,
                                      episode_num=self.episode_number)

        for shipyard_id, val in shipyard_simulated_step_memory.items():
            s = val['state']
            a = val['action']
            pos = val['pos']
            is_occupied = val['is_occupied']
            converted_next_obs, is_occupied_next = env.wrap_observation_for_shipyard_agent(
                obs=Observation(observation),
                player=observation['player'],
                spos=int(pos),
                uid=shipyard_id)
            shipyard_reward = env.get_shipyard_reward(
                observation=observation,
                converted_observation=converted_next_obs,
                uid=shipyard_id,
                done=terminal)
            next_state_vector = converted_next_obs.flatten()
            next_state_vector: np.ndarray = np.append(next_state_vector, is_occupied_next)
            self.shipyard_agent.remember(state=s,
                                         action=a,
                                         reward=shipyard_reward,
                                         new_state=next_state_vector,
                                         done=terminal)
            self.shipyard_agent.learn(step_num=self.step_number,
                                      episode_num=self.episode_number)

    return terminal, [item[0] for item in observation.players]
def get_single_ship_move(self, ship_id, pos, step_observation, raw_observation,
                         ship_simulated_step_memory):
    done = False
    board = Board(raw_observation, raw_configuration=self.configuration)
    observation = Observation(raw_observation)

    """
    ============
    Take Action
    ============
    """
    converted_observation = self.env.wrap_observation_for_ship_agent(
        obs=Observation(board.observation),
        player=board.observation['player'],
        spos=int(pos),
        uid=ship_id)
    state_vector = converted_observation.flatten()

    if self.ship_frame_stack_len > 1:
        multiframe_state = self.env.get_multiframe_ship_observation(ship_id)
        converted_obs = np.concatenate(multiframe_state, axis=0)
        state_vector = converted_obs.flatten()

    if len(self.env.get_multiframe_ship_observation(ship_id)) == self.ship_frame_stack_len:
        action = self.ship_agent.get_action(state_vector,
                                            step=self.step_number,
                                            game=self.episode_number)
    else:
        action = np.random.randint(0, 6)

    self.episode_actions.append(action)
    halite_action = self.env.convert_ship_action_to_halite_enum(action, observation)
    if halite_action and halite_action.name == halite_action.CONVERT.name and \
            observation.players[observation.player][0] < 500:
        # tried to convert without enough halite
        halite_action = None
        action = 5
    if halite_action:
        self.actions_for_step[ship_id] = halite_action.name

    # Take action
    try:
        obs_next: Observation = self.env.update_observation_for_ship(
            board, ship_id, halite_action)
    except KeyError as e:
        print('Actions taken')
        print(self.actions_for_step)
        print('Initial board and observation')
        print(step_observation.players[step_observation.player])
        raise e

    # the ship may no longer exist...
    # ie it collided with an enemy ship or converted to a shipyard
    # for now we will use the new position IF it exists, otherwise just use the old one
    # next_pos = obs_next.players[observation.player][2].get(ship_id, (None, None))[0]

    """
    ============
    Prepare for Model Update
    ============
    """
    # Update model
    if self.training:
        ship_simulated_step_memory[ship_id] = {
            'state': state_vector,
            'action': action,
            'pos': pos
        }

    action_string = halite_action.name if halite_action else 'None'
    if self.verbose and ((self.step_number % 10) == 0):
        print(
            f"Step {self.step_number}: Action taken {action} | {action_string} for ship {ship_id}, "
            f"reward received N/A | Player state {obs_next.players[observation.player]}"
        )

    # update current observation with the simulated step ahead
    raw_observation = obs_next
    return raw_observation
def get_single_shipyard_move(self, shipyard_id, pos, step_observation, raw_observation,
                             shipyard_temporary_initial_memory):
    configuration = self.configuration
    board = Board(raw_observation, raw_configuration=configuration)
    observation = Observation(raw_observation)
    verbose = self.verbose
    done = False

    # Select action
    converted_observation, is_occupied = self.env.wrap_observation_for_shipyard_agent(
        obs=observation, player=observation.player, spos=pos, uid=shipyard_id)
    state_vector = converted_observation.flatten()
    state_vector: np.ndarray = np.append(state_vector, is_occupied)
    action = self.shipyard_agent.get_action(state_vector,
                                            step=self.step_number,
                                            game=self.episode_number)
    halite_action = self.env.convert_shipyard_action_to_halite_enum(
        action, shipyard_id, observation)
    self.episode_actions.append(halite_action)

    """
    ============
    Take Action
    ============
    """
    obs_next = self.env.update_observation_for_shipyard(
        board, shipyard_id, halite_action)
    reward = self.env.get_shipyard_reward(
        obs_next,
        self.env.wrap_observation_for_ship_agent(
            obs=obs_next,
            player=obs_next.player,
            spos=pos,  # because shipyards can't move
            uid=shipyard_id),
        uid=shipyard_id,
        done=done)
    self.episode_rewards.append(reward)

    """
    ============
    Prepare for Update Model
    ============
    """
    if self.training:
        shipyard_temporary_initial_memory[shipyard_id] = {
            'state': state_vector,
            'action': action,
            'pos': pos,
            'is_occupied': is_occupied
        }

    if verbose and ((self.step_number % 10) == 0):
        print(
            f"Step {self.step_number}: Action taken {action} for shipyard {shipyard_id}, "
            f"reward received {reward}")

    # update current observation with the simulated step ahead
    raw_observation = obs_next
    return raw_observation
def learn(self, observation, game_reward, terminal, ship_simulated_step_memory,
          shipyard_simulated_step_memory, step_number, episode_number):
    env = self.env
    """
    Here we are doing learning after the actual "step" has taken place.
    This means that the earlier a ship or shipyard has selected its move,
    the more unknowns and more "friendly reactions" that can occur afterwards.

    It would probably be very useful to include
    - remaining_ship_actions
    - remaining_shipyard_actions
    - and potentially the current epsilon value
    as a part of the state.
    """
    remaining = len(ship_simulated_step_memory)
    for ship_id, val in ship_simulated_step_memory.items():
        s = val['state']
        a = val['action']
        pos = val['pos']
        if step_number >= self.ship_frame_stack_len:
            if self.ship_frame_stack_len > 1:
                multiframe_state = self.env.get_multiframe_ship_observation(ship_id)
                converted_obs = np.concatenate(multiframe_state, axis=0)
                s = converted_obs.flatten()
            converted_next_obs = env.wrap_observation_for_ship_agent(
                obs=Observation(observation),
                player=observation['player'],
                remaining=remaining,
                turn=step_number,
                spos=int(pos),
                uid=ship_id)
            remaining -= 1
            if self.ship_frame_stack_len > 1:
                multiframe_state = self.env.get_multiframe_ship_observation(ship_id)
                converted_next_obs = np.concatenate(multiframe_state, axis=0)
            ship_reward = env.get_ship_reward(
                observation=observation,
                converted_observation=converted_next_obs,
                uid=ship_id,
                done=terminal)
            if len(converted_next_obs) >= self.ship_frame_stack_len:
                next_state_vector = converted_next_obs.flatten()
                try:
                    self.ship_agent.remember(state=s,
                                             action=a,
                                             reward=ship_reward,
                                             new_state=next_state_vector,
                                             done=terminal)
                    self.ship_agent.learn(step_num=step_number,
                                          episode_num=episode_number)
                except Exception as e:
                    print('shapes')
                    print(s.shape)
                    print(next_state_vector.shape)
                    raise e

    for shipyard_id, val in shipyard_simulated_step_memory.items():
        s = val['state']
        a = val['action']
        pos = val['pos']
        converted_next_obs = env.wrap_observation_for_shipyard_agent(
            obs=Observation(observation),
            player=observation['player'],
            spos=int(pos),
            uid=shipyard_id)
        shipyard_reward = env.get_shipyard_reward(
            observation=observation,
            converted_observation=converted_next_obs,
            uid=shipyard_id,
            done=terminal)
        next_state_vector = converted_next_obs.flatten()
        self.shipyard_agent.remember(state=s,
                                     action=a,
                                     reward=shipyard_reward,
                                     new_state=next_state_vector,
                                     done=terminal)
        self.shipyard_agent.learn(step_num=step_number,
                                  episode_num=episode_number)

    self.reset_step_actions()
    return terminal, [item[0] for item in observation.players]
def get_single_shipyard_move(
        self,
        shipyard_id,
        pos,
        step_observation,
        raw_observation,
        shipyard_temporary_initial_memory,
        step_number=0,
        episode_number=0,
):
    configuration = self.configuration
    board = Board(raw_observation, raw_configuration=configuration)
    observation = Observation(raw_observation)
    verbose = self.verbose
    done = False

    # Select action
    converted_observation = self.shipyard_state_wrapper.get_basic_single_frame_complete_observation(
        obs=observation, player=observation.player, sy_pos=pos, uid=shipyard_id)
    state_vector = converted_observation

    player_state = step_observation.players[step_observation.player]
    if len(player_state[2]) == 0 and player_state[0] > 500:
        # no ships left and more than the spawn cost in the bank: force a SPAWN
        action = 1
    else:
        action = self.shipyard_agent.get_action(state_vector,
                                                step=step_number,
                                                game=episode_number)
    halite_action = self.shipyard_state_wrapper.convert_action_to_enum(
        shipyard_id, observation, action)
    if halite_action:
        self.actions_for_step[shipyard_id] = halite_action.name

    """
    ============
    Take Action
    ============
    """
    obs_next = self.env.update_observation_for_shipyard(
        board, shipyard_id, halite_action)
    reward = self.env.get_shipyard_reward(
        obs_next,
        self.env.wrap_observation_for_shipyard_agent(
            obs=obs_next,
            player=obs_next.player,
            spos=pos,  # because shipyards can't move
            uid=shipyard_id),
        uid=shipyard_id,
        done=done)

    """
    ============
    Prepare for Update Model
    ============
    """
    is_occupied = state_vector[-2]
    if self.training:
        shipyard_temporary_initial_memory[shipyard_id] = {
            'state': state_vector,
            'action': action,
            'pos': pos,
            'is_occupied': is_occupied
        }

    if verbose and ((step_number % 10) == 0):
        print(
            f"Step {step_number}: Action taken {action} for shipyard {shipyard_id}, "
            f"reward received {reward}")

    # update current observation with the simulated step ahead
    raw_observation = obs_next
    return raw_observation
def setUp(self):
    obs = Observation(test_state['observation'])
    self.obs = obs
    self.ship_state_wrapper = ShipStateWrapper(radius=2, max_frames=2, map_size=8)
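# Hedged sketch of a follow-up test, assuming ShipStateWrapper exposes the
# state_size attribute that _run passes to the Agent as input_dims.
def test_state_size_is_positive(self):
    self.assertGreater(self.ship_state_wrapper.state_size, 0)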
from kaggle_environments import make
from kaggle_environments.envs.halite.helpers import Board, ShipAction, ShipyardAction, Observation

env = make("halite", configuration={"episodeSteps": 10, "size": 8})
env.run(["random", "random"])
observation = Observation(env.state[0]['observation'])

print(env.configuration)
print('HALITE')
print(observation.halite)
print('PLAYERS')
print(observation.players)
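# Quick follow-up sketch: the same raw observation can also be wrapped in a
# Board for a symbolic view of cells, ships, and shipyards (printing a Board
# renders an ASCII map of the grid).
board = Board(env.state[0]['observation'], raw_configuration=env.configuration)
print(board)
print('SHIPS')
print(board.ships)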
def step(self, actions) -> Tuple[Observation, float, bool]:
    """Step forward in the actual environment."""
    self.observation, reward, terminal, info = self.trainer.step(actions)
    self.observation = Observation(self.observation)
    terminal = terminal if terminal else 0
    return self.observation, reward, terminal
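# Hedged rollout sketch using the step() wrapper above. Assumes `halite_env` is
# an initialised HaliteEnv whose reset() returns (observation, board) as in
# _run, and that the policy here is trivial: an empty actions dict, i.e. every
# unit idles each turn.
def rollout_idle(halite_env, max_steps=50):
    observation, board = halite_env.reset()
    for _ in range(max_steps):
        observation, reward, terminal = halite_env.step(actions={})
        if terminal:
            break
    return observation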