from tensorforce.agents import DQNAgent
from tensorforce.contrib.openai_gym import OpenAIGym


def main():
    # Gym wrapper around the Pioneer P3DX environment.
    env = OpenAIGym("P3DX-v0")

    # DQN agent: two conv layers and a dense layer over 80x80x4 frame stacks,
    # epsilon-greedy exploration decayed over the first 1000 timesteps,
    # a replay memory of 1000 transitions, and double Q-learning.
    agent = DQNAgent(
        states=dict(type='float', shape=(80, 80, 4)),
        actions=dict(type='int', num_actions=7),
        network=[
            dict(type="conv2d", size=16, window=[8, 8], stride=4, activation="relu"),
            dict(type="conv2d", size=32, window=[4, 4], stride=2, activation="relu"),
            dict(type="flatten"),
            dict(type="dense", size=256)
        ],
        actions_exploration=dict(
            type="epsilon_decay",
            initial_epsilon=1.0,
            final_epsilon=0.1,
            timesteps=1000
        ),
        memory=dict(type="replay", capacity=1000, include_next_states=True),
        update_mode=dict(unit="timesteps", batch_size=16, frequency=4),
        discount=0.99,
        entropy_regularization=None,
        double_q_model=True,
        optimizer=dict(type="adam", learning_rate=1e-4))

    # Restore a previously trained model if one is available.
    try:
        agent.restore_model(directory="modelo/", file="data-129235")
        print("Found data!")
    except Exception as e:
        print(e)
        print("Can't load data")

    print("Starting execution")
    state = env.reset()
    agent.reset()
    terminal_state = False

    try:
        while True:
            # Get action: deterministic and independent, so no exploration
            # and no observing (pure evaluation of the learned policy).
            action = agent.act(state, deterministic=True, independent=True)
            print(action)

            # Execute the action in the environment.
            state, terminal_state, reward = env.execute(action)
            if terminal_state:
                raise KeyboardInterrupt
    except KeyboardInterrupt:
        print("Terminal state", terminal_state)
        state = env.reset()
        agent.reset()


if __name__ == "__main__":
    main()
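# Note: "P3DX-v0" is not part of the standard Gym registry, so it has to be
# registered before OpenAIGym("P3DX-v0") can resolve it. A minimal sketch,
# assuming a project module `p3dx_env` exposing a `P3DXEnv` gym.Env subclass
# (both names are hypothetical):
#
#     from gym.envs.registration import register
#     register(id='P3DX-v0', entry_point='p3dx_env:P3DXEnv')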
import asyncio
import os
import time

from tensorforce.agents import DQNAgent

# Project-local modules; `flatten` and `NOMBRE_DE_PIECES` (the 7 tetromino
# kinds) are assumed to be provided by the project as well.
import ClientInterface
import Heuristic
import State
import Stats


class Reinforcement(ClientInterface.ClientInterface):
    def __init__(self, name, load_file=None, is_stats=False, file_stats=None,
                 train_adversary_level=2, nb_batches=5000, nb_games_per_batch=2,
                 layer_size=15, nb_layers=3):
        """
        :param name: name of the AI.
        :param load_file: path and name of the model to load (without any extension).
        :param is_stats: boolean which tells whether the statistics are enabled.
        :param file_stats: name of the file where the statistics are written.
        :param train_adversary_level: integer indicating the AI to train against
            (corresponds to a level in AICreator).
        :param nb_batches: number of batches. A batch is a group of successive games over which
            the ratio (nb_won_games / nb_games_per_batch) is computed and saved in scores.txt.
        :param nb_games_per_batch: number of games per batch.
        :param layer_size: size of a neural network layer.
        :param nb_layers: number of layers in the neural network.
        """
        super().__init__(name, load_file)
        self.current_game_is_finish = None
        self.first_game = True
        self.my_id_in_game = None

        # scores
        self.score_self_old, self.score_self_new = 0, 0
        self.score_other_old, self.score_other_new = 0, 0
        self.file_scores = open('scores.txt', 'w')

        # AI parameters
        self.heuristics = [
            Heuristic.line_transition, Heuristic.column_transition,
            Heuristic.hidden_empty_cells, Heuristic.wells, Heuristic.holes,
            Heuristic.highest_column, Heuristic.columns_heights
        ]
        # Evaluate the heuristics once on an empty state to infer the size of
        # the flattened feature vector (some heuristics return lists).
        state = State.State()
        heuristics_sizes = [heuristic(state, state, None) for heuristic in self.heuristics]
        self.nb_heuristics = len(flatten(heuristics_sizes))
        print('self.nb_heuristics', self.nb_heuristics)

        self.train_adversary_level = train_adversary_level

        # iteration
        self.nb_batches = nb_batches
        self.nb_games_per_batch = nb_games_per_batch
        self.iteration = 0

        # neural network: nb_layers dense layers of layer_size units each
        self.layer_size = layer_size
        self.nb_layers = nb_layers
        network_spec = [dict(type='dense', size=self.layer_size, activation='relu')] * self.nb_layers
        self.agent = DQNAgent(
            states_spec={
                'shape': (self.nb_heuristics + NOMBRE_DE_PIECES,),
                'type': 'float'
            },
            actions_spec={
                'hor_move': {'type': 'int', 'num_actions': 11},
                'rotate': {'type': 'int', 'num_actions': 4},
                'choose': {'type': 'int', 'num_actions': 3}
            },
            network_spec=network_spec)

        # load a saved model if requested
        if load_file is not None:
            self.load(load_file)

        # stats
        self.is_stats = is_stats
        self.my_stats = None
        self.file_stats = file_stats
        self.pid_stats = None

    async def play(self, state):
        """
        Associates an action to a state. Called by the server.

        :param state: dictionary containing information about the game, sent by the server.
        :return: action to apply.
        """
        # update all the scores (self.score_self_new, self.score_self_old,
        # self.score_other_new, self.score_other_old)
        self.update_scores(state)

        # format the state to make it compatible with tensorforce
        state_formatted = self.format_state(state)

        if self.first_game:
            # at the first call to play in the first game, no action has been
            # performed yet, so there is nothing to observe
            self.first_game = False
            self.agent.reset()
        else:
            # pass the reward for the previous action to the agent
            terminal = False
            reward = (self.score_self_new - self.score_self_old) - \
                     (self.score_other_new - self.score_other_old)
            self.agent.observe(terminal, reward)

        # select the action (exploitation or exploration)
        action = self.agent.act(state_formatted)

        # format the action to make it exploitable by the Tetris game,
        # e.g. {"hor_move": -2, "rotate": 1, "choose": state["pieces"][0]}
        action_to_apply = self.format_action(action, state)
        return action_to_apply

    def on_init_game(self, data):
        """
        Called at the beginning of a game.

        :param data: dictionary containing information about the game, sent by the server.
        """
        print()
        print(self.iteration)
        self.my_id_in_game = data["ids_in_game"][0]

    def on_finished_game(self, data):
        """
        Called at the end of a game.

        :param data: dictionary containing information about the game, sent by the server.
        """
        self.iteration += 1
        self.current_game_is_finish = True

        # update all the scores
        self.update_scores(data)

        # pass the final observation to the agent
        terminal = True
        reward = (self.score_self_new - self.score_self_old) - \
                 (self.score_other_new - self.score_other_old)
        self.agent.observe(terminal, reward)

    def update_scores(self, state):
        """
        Updates the scores of the agent and of the other player.

        :param state: dictionary containing information about the game.
        """
        # shift the current scores into the old scores
        self.score_self_old, self.score_other_old = self.score_self_new, self.score_other_new
        # get the new scores
        self.score_self_new, self.score_other_new = self.format_score(state)

    @staticmethod
    def format_action(action, state):
        """
        Formats the action returned by tensorforce so that it can be used in the play function.

        :param action: action returned by tensorforce (function act).
        :param state: dictionary containing information about the game, sent by the server.
        :return: dictionary containing the action.
        """
        # convert int32 (which is not serializable) to standard int
        action_to_apply = {key: int(value) for key, value in action.items()}
        action_to_apply['hor_move'] -= 5  # [0, 10] -> [-5, 5]
        action_to_apply['choose'] = state['pieces'][action_to_apply['choose']]  # index to letter
        return action_to_apply

    def evaluate_heuristics(self, heuristics, g_prec, g_next, action):
        """
        Computes the current values of the heuristics.

        :param heuristics: list containing the heuristic functions.
        :param g_prec: previous state.
        :param g_next: current state.
        :param action: action which allows to go from g_prec to g_next.
        :return: flat list containing the heuristics values (flattening is
            necessary because some heuristics are lists).
        """
        return flatten([heuristic(g_prec, g_next, action) for heuristic in heuristics])

    def format_state(self, state):
        """
        Formats the state so that it can be used by tensorforce.

        :param state: dictionary containing information about the game, sent by the server.
        :return: list containing the heuristics values. Represents the state.
        """
        state_bis = State.State(state['grid'])
        heuristics_values = self.evaluate_heuristics(self.heuristics, None, state_bis, None)

        # selectable pieces as a one-hot vector
        pieces_one_hot = self.format_pieces(state['pieces'])

        # state used by tensorforce
        state_formatted = heuristics_values + pieces_one_hot
        print('{}, {}'.format(heuristics_values, pieces_one_hot))
        return state_formatted

    def format_pieces(self, pieces):
        """
        Formats the available pieces so that they can be used by tensorforce.

        :param pieces: 3-element list containing letters representing pieces (no repetition).
        :return: 7-element one-hot list containing 1s and 0s.
        """
        pieces_formatted = [0] * NOMBRE_DE_PIECES
        for piece in pieces:
            pieces_formatted[self.char_to_int(piece)] = 1
        return pieces_formatted

    def format_score(self, state):
        """
        Extracts the score of the AI and of the other player.

        :param state: dictionary containing information about the game, sent by the server.
        :return: score_self, score_other.
        """
        id_self = self.my_id_in_game
        id_other = (id_self + 1) % 2
        score_self = state['score'][id_self]
        score_other = state['score'][id_other]
        return score_self, score_other

    @staticmethod
    def char_to_int(char):
        """
        Converts a letter whose shape looks like a tetromino to a corresponding integer.

        :param char: 'O', 'I', 'L', 'T', 'S', 'Z' or 'J'.
        :return: integer from 0 to 6.
        """
        lu_table = {'O': 0, 'I': 1, 'L': 2, 'T': 3, 'S': 4, 'Z': 5, 'J': 6}
        return lu_table[char]

    async def train(self):
        """
        Triggers the training.
        """
        await super().init_train()
        if self.is_stats:
            self.my_stats = Stats.Stats()
            self.pid_stats = await self.my_stats.observe()

        for _ in range(self.nb_batches):
            wins = 0
            for _ in range(self.nb_games_per_batch):
                if self.is_stats:
                    await super().new_game(players=[[self.my_client.pid, 1]],
                                           ias=[[self.train_adversary_level, 1]],
                                           viewers=[0, self.pid_stats])
                else:
                    await super().new_game(players=[[self.my_client.pid, 1]],
                                           ias=[[self.train_adversary_level, 1]],
                                           viewers=[0])

                # wait until on_finished_game marks the game as finished
                self.current_game_is_finish = False
                while not self.current_game_is_finish:
                    await asyncio.sleep(0)

                # increment wins when a game is won
                wins += 1 if self.score_self_new > self.score_other_new else 0

            # save the win ratio of the batch in a file
            self.file_scores.write('{}\n'.format(wins / self.nb_games_per_batch))
            self.file_scores.flush()
            self.save()

    def save(self):
        """
        Saves the current model in directory rein_learn_models as 3 files.
        """
        # TODO: report whether the model was loaded correctly
        time_str = time.strftime('%Y%m%d_%H%M%S')
        directory = os.path.join(os.getcwd(), 'rein_learn_models', 'agent_' + time_str)
        checkpoint = self.agent.save_model(directory=directory, append_timestep=True)
        print('directory: {}'.format(directory))
        print('checkpoint: {}'.format(checkpoint))

    def load(self, load_file):
        """
        Loads a saved model.

        :param load_file: path and name of the model to load (without any extension).
        """
        # load_file is the file path without any extension
        directory = os.path.dirname(load_file)
        file = os.path.basename(load_file)
        self.agent.restore_model(directory=directory, file=file)
import csv

import numpy as np
from tensorforce.agents import DQNAgent
from tensorforce.contrib.openai_gym import OpenAIGym
from tensorforce.execution import Runner


def main():
    env = OpenAIGym("P3DX-v0")

    # Same DQN architecture and hyperparameters as the evaluation script above.
    agent = DQNAgent(
        states=dict(type='float', shape=(80, 80, 4)),
        actions=dict(type='int', num_actions=7),
        network=[
            dict(type="conv2d", size=16, window=[8, 8], stride=4, activation="relu"),
            dict(type="conv2d", size=32, window=[4, 4], stride=2, activation="relu"),
            dict(type="flatten"),
            dict(type="dense", size=256)
        ],
        actions_exploration=dict(
            type="epsilon_decay",
            initial_epsilon=1.0,
            final_epsilon=0.1,
            timesteps=1000
        ),
        memory=dict(type="replay", capacity=1000, include_next_states=True),
        update_mode=dict(unit="timesteps", batch_size=16, frequency=4),
        discount=0.99,
        entropy_regularization=None,
        double_q_model=True,
        optimizer=dict(type="adam", learning_rate=1e-4))

    # Resume from a previous checkpoint if one exists.
    try:
        agent.restore_model(directory="data/", file="data-117246")
        print("Found data!")
    except Exception:
        print("Can't load data")

    SAVE_INTERVAL = 10

    def episode_finished(r):
        # Every SAVE_INTERVAL episodes: print progress, checkpoint the model,
        # and append the reward/episode history to CSV files.
        if r.episode % SAVE_INTERVAL == 0:
            print("Finished episode {ep} after {ts} timesteps".format(
                ep=r.episode + 1, ts=r.timestep + 1))
            print("Episode reward: {}".format(r.episode_rewards[-1]))
            print("Average of last {} rewards: {}\n".format(
                SAVE_INTERVAL, np.mean(r.episode_rewards[-SAVE_INTERVAL:])))
            r.agent.save_model(directory="data/data", append_timestep=True)
            with open("reward_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                for reward in r.episode_rewards[-SAVE_INTERVAL:]:
                    writer.writerow([r.episode, reward])
            with open("episode_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([r.episode, r.timestep])
        # returning True keeps the runner going
        return True

    runner = Runner(
        agent=agent,        # Agent object
        environment=env     # Environment object
    )

    max_episodes = 10000
    max_timesteps = 50000000
    runner.run(max_timesteps, max_episodes, episode_finished=episode_finished)
    runner.close()


if __name__ == "__main__":
    main()
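# The appended CSV logs can be inspected after (or during) training. A minimal
# sketch, assuming pandas and matplotlib are available (the column names below
# just label the two fields written per row above):
#
#     import pandas as pd
#     import matplotlib.pyplot as plt
#     df = pd.read_csv("reward_history.csv", names=["episode", "reward"])
#     df.groupby("episode")["reward"].mean().plot()
#     plt.show()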