def test_blogpost_introduction_runner(self):
        from tensorforce.environments.minimal_test import MinimalTest
        from tensorforce.agents import DQNAgent
        from tensorforce.execution import Runner

        environment = MinimalTest(specification=[('int', ())])

        network_spec = [
            dict(type='dense', size=32)
        ]

        agent = DQNAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50
        )
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(runner):
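            # Called by the Runner after each episode; returning False stops the run early
            # (here: once the most recent 100 episode rewards are all >= 1.0).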
            if runner.episode % 100 == 0:
                print(sum(runner.episode_rewards[-100:]) / 100)
            return runner.episode < 100 \
                or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

        # runner.run(episodes=1000, episode_finished=episode_finished)
        runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

        ### Code block: next
        agent = DQNAgent(
            states_spec=environment.states,
            actions_spec=environment.actions,
            network_spec=network_spec,
            memory=dict(
                type='replay',
                capacity=1000
            ),
            batch_size=8,
            first_update=100,
            target_sync_frequency=50
        )

        # max_episodes = 1000
        max_episodes = 10  # Only 10 episodes for this test
        max_timesteps = 2000

        episode = 0
        episode_rewards = list()

        while True:
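            # Manual training loop equivalent to what Runner.run does internally:
            # act -> execute -> observe, ending an episode on a terminal state or after max_timesteps steps.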
            state = environment.reset()
            agent.reset()

            timestep = 0
            episode_reward = 0
            while True:
                action = agent.act(states=state)
                state, terminal, reward = environment.execute(actions=action)
                agent.observe(terminal=terminal, reward=reward)

                timestep += 1
                episode_reward += reward

                if terminal or timestep == max_timesteps:
                    break

            episode += 1
            episode_rewards.append(episode_reward)

            if all(reward >= 1.0 for reward in episode_rewards[-100:]) or episode == max_episodes:
                break

        agent.close()
        environment.close()
Example #2
    s = time.time()
    skip_steps = 8
    g.flip_player()
    for i in range(100000):
        state = g.reset()

        while not g.is_terminal():
            state = cv2.resize(state, (80, 80))
            # Perform Action
            action = agent.act(state)
            actions[action] += 1
            _, r, t, _ = g.step(action)

            # Add experience, agent automatically updates model according to batch size
            agent.observe(reward=r, terminal=t)
            g.flip_player()
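            # Opponent's turn: play a random action, then hand control back to the learning agent.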
            a2 = random.randint(0, g.get_action_space() - 1)
            _, _, t2, _ = g.step(a2)
            g.flip_player()

            for _ in range(skip_steps):
                g.update()
            # Re-render and get the updated state for the next iteration
            state = g.get_state()

            if t or t2:
                if g.winner.id not in statistics:
                    statistics[g.winner.id] = 1
                else:
Example #3
episode_plot = 1
episode_print = 1
avgreward_per_episode = []
profits_per_episode = []
avg_ep_reward = 0
for i in range(num_episodes):
    agent.reset()
    states = notebook.reset()
    terminal = False
    step_reward = []
    print("Episode: " + str(i))
    while not terminal:
        action = agent.act(states=states)
        position = notebook.position
        states, terminal, reward = notebook.execute(actions=action)
        agent.observe(reward=reward, terminal=terminal)
        if step_print:
            print("TS: " + str(notebook.time_step) + " action: " +
                  str(action) + " position: " + str(position) + " reward: " +
                  str(reward) + " profit: " + str(notebook.profit) +
                  " curr_price: " + str(notebook.curr_price) + " curr_cash: " +
                  str(notebook.curr_cash))
        step_reward.append(reward)
    final_profit = notebook.curr_cash - notebook.starting_cash
    if episode_print:
        print(" FinalCash: " + str(notebook.curr_cash) + " Profit: " +
              str(notebook.curr_cash - notebook.starting_cash) +
              " MeanReward: " + str(np.array(step_reward).mean()))
    profits_per_episode.append(final_profit)
    if step_plot:
        plt.figure(figsize=(20, 12))
Example #4
        if infrastructure.attempts < infrastructure.peers:
            agent_ppo.observe(reward=reward, terminal=False)
        else:
            agent_ppo.observe(reward=reward, terminal=True)

        rl_ppo.append(reward)

        # DQN agent
        action = agent_dqn.act(state)
        action = action.values()
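        # act() returns a dict of named actions; only its values are forwarded
        # (assumes infrastructure.monkey accepts them in this form).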

        reward = infrastructure.monkey(action)

        if infrastructure.attempts < infrastructure.peers:
            agent_dqn.observe(reward=reward, terminal=False)
        else:
            agent_dqn.observe(reward=reward, terminal=True)

        rl_dqn.append(reward)

        # VPG agent
        action = agent_vpg.act(state)
        action = action.values()

        reward = infrastructure.monkey(action)

        if infrastructure.attempts < infrastructure.peers:
            agent_vpg.observe(reward=reward, terminal=False)
        else:
            agent_vpg.observe(reward=reward, terminal=True)
Example #5
        state = dict()
        state['image'] = observation
        state['previous_act'] = GazeboMaze.vel_cmd
        state['relative_pos'] = GazeboMaze.p
        # state = dict(image=observation, previous_act=GazeboMaze.vel_cmd, relative_pos=GazeboMaze.p)
        # print(state)

        # Query the agent for its action decision
        action = agent.act(state)
        print(action)
        # Execute the decision and retrieve the current information
        observation, terminal, reward = GazeboMaze.execute(action)
        observation = observation / 255.0  # normalize
        # print(reward)
        # Pass feedback about performance (and termination) to the agent
        agent.observe(terminal=terminal, reward=reward)
        timestep += 1
        episode_reward += reward
        if terminal or timestep == max_timesteps:
            success = GazeboMaze.success
            break

    episode += 1
    total_timestep += timestep
    # avg_reward = float(episode_reward)/timestep
    successes.append(success)
    episode_rewards.append([episode_reward, timestep, success])

    # if total_timestep > 100000:
    #     print('{}th episode reward: {}'.format(episode, episode_reward))
Example #6
class Reinforcement(ClientInterface.ClientInterface):
    def __init__(self,
                 name,
                 load_file=None,
                 is_stats=False,
                 file_stats=None,
                 train_adversary_level=2,
                 nb_batches=5000,
                 nb_games_per_batch=2,
                 layer_size=15,
                 nb_layers=3):
        """
        :param name: name of the AI.
        :param load_file: path and name of the model to load (without any extension).
        :param is_stats: boolean which tells whether the statistics are enabled.
        :param file_stats: name of the file where the statistics are written.
        :param train_adversary_level: integer indicating the AI to train against (corresponds to level in AICreator).
        :param nb_batches: number of batches. A batch is a group of successive games for which the ratio
        (nb_won_games / nb_games_per_batch) is computed and saved in scores.txt.
        :param nb_games_per_batch: number of games per batch.
        :param layer_size: size of a neural network layer.
        :param nb_layers: number of layers in the neural network.
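
        Example (illustrative values only; training is started via the train coroutine):
            bot = Reinforcement(name='dqn_bot', train_adversary_level=2,
                                nb_batches=100, nb_games_per_batch=2)
            await bot.train()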
        """

        super().__init__(name, load_file)

        self.current_game_is_finish = None
        self.first_game = True

        # score
        self.score_self_old, self.score_self_new = 0, 0
        self.score_other_old, self.score_other_new = 0, 0
        self.file_scores = open('scores.txt', 'w')

        # AI parameters
        self.heuristics = [
            Heuristic.line_transition, Heuristic.column_transition,
            Heuristic.hidden_empty_cells, Heuristic.wells, Heuristic.holes,
            Heuristic.highest_column, Heuristic.columns_heights
        ]

        state = State.State()
        heuristics_sizes = [
            heuristic(state, state, None) for heuristic in self.heuristics
        ]
        self.nb_heuristics = len(flatten(heuristics_sizes))
        print('heuristics_sizes', heuristics_sizes)
        self.train_adversary_level = train_adversary_level

        # iteration
        self.nb_batches = nb_batches
        self.nb_games_per_batch = nb_games_per_batch
        self.iteration = 0

        # neural network
        self.layer_size = layer_size
        self.nb_layers = nb_layers
        network_spec = [
            dict(type='dense', size=self.layer_size, activation='relu')
        ] * self.nb_layers

        self.agent = DQNAgent(states_spec={
            'shape': (self.nb_heuristics + NOMBRE_DE_PIECES, ),
            'type': 'float'
        },
                              actions_spec={
                                  'hor_move': {
                                      'type': 'int',
                                      'num_actions': 11
                                  },
                                  'rotate': {
                                      'type': 'int',
                                      'num_actions': 4
                                  },
                                  'choose': {
                                      'type': 'int',
                                      'num_actions': 3
                                  }
                              },
                              network_spec=network_spec)

        # loading of a saved model
        if load_file is not None:
            self.load(load_file)

        # stats
        self.is_stats = is_stats
        self.my_stats = None
        self.file_stats = file_stats
        self.pid_stats = None

    async def play(self, state):
        """
        Associates an action with a state. Called by the server.
        :param state: dictionary containing information about the game, sent by the server.
        :return: action to apply.
        """

        # update all the scores (self.score_self_new, self.score_self_old, self.score_other_new, self.score_other_old)
        self.update_scores(state)

        # format the state to make it compatible with tensorforce
        state_formatted = self.format_state(state)

        if self.first_game:
            # first call to play in the first game: no action has been performed yet, so there is nothing to observe
            self.first_game = False
            self.agent.reset()
        else:
            # pass observation to the agent
            terminal = False
            reward = (self.score_self_new - self.score_self_old) - (
                self.score_other_new - self.score_other_old)
            self.agent.observe(terminal, reward)

        # select the action (exploitation or exploration)
        action = self.agent.act(state_formatted)

        # format the action to make it exploitable by the Tetris game
        action_to_apply = self.format_action(action, state)

        return action_to_apply
        # return {"hor_move": -2, "rotate": 1, "choose": state["pieces"][0]}

    def on_init_game(self, data):
        """
        Called at the beginning of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """

        print()
        print(self.iteration)

        self.my_id_in_game = data["ids_in_game"][0]

    def on_finished_game(self, data):
        """
        Called at the end of a game.
        :param data: dictionary containing information about the game, sent by the server.
        """

        self.iteration += 1

        self.current_game_is_finish = True

        # update all the scores
        self.update_scores(data)

        # pass observation to the agent
        terminal = True
        reward = (self.score_self_new - self.score_self_old) - (
            self.score_other_new - self.score_other_old)
        self.agent.observe(terminal, reward)

    def update_scores(self, state):
        """
        Updates the scores of the agent and of the other player.
        :param state: dictionary containing information about the game.
        """

        # update the old scores
        self.score_self_old, self.score_other_old = self.score_self_new, self.score_other_new

        # get the new scores
        self.score_self_new, self.score_other_new = self.format_score(state)

    @staticmethod
    def format_action(action, state):
        """
        Formats the action returned by tensorforce so that it can be used in the play function.
        :param action: action returned by tensorforce (function act).
        :param state: dictionary containing information about the game, sent by the server.
        :return: dictionary containing the action.
        """

        # convert int32 (which is not serializable) to standard int
        action_to_apply = {key: int(value) for key, value in action.items()}

        action_to_apply['hor_move'] -= 5  # [0, 10] -> [-5, 5]
        action_to_apply['choose'] = state['pieces'][
            action_to_apply['choose']]  # index to letter

        return action_to_apply

    def evaluate_heuristics(self, heuristics, g_prec, g_next, action):
        """
        Computes the current values of the heuristics.
        :param heuristics: list containing the heuristic functions.
        :param g_prec: previous state.
        :param g_next: current state.
        :param action: action which allows to go from g_prec to g_next.
        :return: flat list containing the heuristics values (flattening is necessary because some heuristics are lists).
        """

        return flatten(
            [heuristic(g_prec, g_next, action) for heuristic in heuristics])

    def format_state(self, state):
        """
        Formats the state so that it can be used by tensorforce.
        :param state: dictionary containing information about the game, sent by the server.
        :return: list containing the heuristics values. Represents the state.
        """

        state_bis = State.State(state['grid'])
        heuristics_values = self.evaluate_heuristics(self.heuristics, None,
                                                     state_bis, None)

        # selectable pieces as a one-hot vector
        pieces_one_hot = self.format_pieces(state['pieces'])

        # state used by tensorforce
        state_formatted = heuristics_values + pieces_one_hot

        print('{}, {}'.format(heuristics_values, pieces_one_hot))
        return state_formatted

    def format_pieces(self, pieces):
        """
        Formats the available pieces so that they can be used by tensorforce.
        :param pieces: 3-element list containing letters representing pieces (no repetition).
        :return: 7-element one-hot list containing 1 or 0.
        """

        pieces_formatted = [0] * NOMBRE_DE_PIECES

        for piece in pieces:
            pieces_formatted[self.char_to_int(piece)] = 1

        return pieces_formatted

    def format_score(self, state):
        """
        Extracts the score of the AI and of the other player.
        :param state: dictionary containing information about the game, sent by the server.
        :return: score_self, score_other.
        """

        id_self = self.my_id_in_game
        id_other = (id_self + 1) % 2
        score_self = state['score'][id_self]
        score_other = state['score'][id_other]

        return score_self, score_other

    @staticmethod
    def char_to_int(char):
        """
        Converts a letter whose shape looks like a tetromino to a corresponding integer.
        :param char: 'O', 'I', 'L', 'T', 'S', 'Z' or 'J'.
        :return: integer from 0 to 6.
        """

        lu_table = {'O': 0, 'I': 1, 'L': 2, 'T': 3, 'S': 4, 'Z': 5, 'J': 6}
        return lu_table[char]

    async def train(self):
        """
        Triggers the training.
        """

        await super().init_train()
        if self.is_stats:
            self.my_stats = Stats.Stats()
            self.pid_stats = await self.my_stats.observe()

        for _ in range(self.nb_batches):
            wins = 0
            for _ in range(self.nb_games_per_batch):
                if self.is_stats:
                    await super().new_game(
                        players=[[self.my_client.pid, 1]],
                        ias=[[self.train_adversary_level, 1]],
                        viewers=[0, self.pid_stats])
                else:
                    await super().new_game(
                        players=[[self.my_client.pid, 1]],
                        ias=[[self.train_adversary_level, 1]],
                        viewers=[0])

                self.current_game_is_finish = False

                while not self.current_game_is_finish:
                    await asyncio.sleep(0)

                self.current_game_is_finish = False

                # increment wins when a game is won
                wins += 1 if self.score_self_new > self.score_other_new else 0

            # save the scores in a file
            self.file_scores.write('{}\n'.format(wins /
                                                 self.nb_games_per_batch))
            self.file_scores.flush()

        self.save()

    def save(self):
        """
        Saves the current model in directory rein_learn_models as 3 files.
        """

        # TODO: report whether the model was loaded correctly
        # directory = os.path.join(os.getcwd(), 'rein_learn_models')
        time_str = time.strftime('%Y%m%d_%H%M%S')
        directory = os.path.join(os.getcwd(), 'rein_learn_models',
                                 'agent_' + time_str)
        checkpoint = self.agent.save_model(directory=directory,
                                           append_timestep=True)
        print('directory: {}'.format(directory))
        print('checkpoint: {}'.format(checkpoint))

    def load(self, load_file):
        """
        Loads a saved model.
        :param load_file: path and name of the model to load (without any extension).
        """

        # load_file represents the file path (without any extension)
        directory = os.path.dirname(load_file)
        file = os.path.basename(load_file)

        self.agent.restore_model(directory=directory, file=file)
Example #7
    def test_blogpost_introduction_runner(self):
        from tensorforce.config import Configuration
        from tensorforce.core.networks import layered_network_builder
        from tensorforce.environments.minimal_test import MinimalTest
        from tensorforce.agents import DQNAgent
        from tensorforce.execution import Runner

        environment = MinimalTest(definition=False)

        network_config = [
            dict(type='dense', size=32)
        ]
        agent_config = Configuration(
            batch_size=8,
            learning_rate=0.001,
            memory_capacity=800,
            first_update=80,
            repeat_update=4,
            target_update_frequency=20,
            states=environment.states,
            actions=environment.actions,
            network=layered_network_builder(network_config)
        )

        agent = DQNAgent(config=agent_config)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(runner):
            if runner.episode % 100 == 0:
                print(sum(runner.episode_rewards[-100:]) / 100)
            return runner.episode < 100 \
                   or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

        # runner.run(episodes=1000, episode_finished=episode_finished)
        runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test

        ### Code block: next

        # max_episodes = 1000
        max_episodes = 10  # Only 10 episodes for this test
        max_timesteps = 2000

        episode = 0
        episode_rewards = list()

        while True:
            state = environment.reset()
            agent.reset()

            timestep = 0
            episode_reward = 0
            while True:
                action = agent.act(state=state)
                state, reward, terminal = environment.execute(action=action)
                agent.observe(reward=reward, terminal=terminal)

                timestep += 1
                episode_reward += reward

                if terminal or timestep == max_timesteps:
                    break

            episode += 1
            episode_rewards.append(episode_reward)

            if all(reward >= 1.0 for reward in episode_rewards[-100:]) \
                    or episode == max_episodes:
                break