Example 1
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100, target_update_interval=500,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Original convolutional Action Value model, kept for reference but disabled here:
        #     Convolution2D((8, 8), 16, strides=4),
        #     Convolution2D((4, 4), 32, strides=2),
        #     Convolution2D((3, 3), 32, strides=1),
        #     Dense(256, init=he_uniform(scale=0.01)),
        #     Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))

        # Action Value model (used by agent to interact with the environment);
        # this variant uses a small fully-connected network instead of the convolutions above.
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                #Dense(16, init=he_uniform(scale=0.01)),
                #Dense(32, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

        #self._trainer.restore_from_checkpoint('models/oldmodels/model800000')

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        #print(self._num_actions_taken)
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a more stable version
        of the Action Value Network used to increase training stability.

        The Target Network is a frozen copy of the Action Value Network, updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            # Note: unlike the other examples, this variant trains on every step once
            # train_after is reached (the train_interval check below is commented out).
            #if (agent_step % self._train_interval) == 0:
            print('\nTraining minibatch\n')
            # `client` and `zero_controls` are assumed to be module-level AirSim handles,
            # used here to stop the car while the minibatch is trained.
            client.setCarControls(zero_controls)
            pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)
            self._trainer.train_minibatch(
                self._trainer.loss_function.argument_map(
                    pre_states=pre_states,
                    actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                    post_states=post_states,
                    rewards=rewards,
                    terminals=terminals
                )
            )
            self._num_trains += 1
            # Update the Target Network and save a checkpoint every 20 training steps
            # (target_update_interval is not used by this variant).
            if self._num_trains % 20 == 0:
                print('updating network')
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                # `dirname` is assumed to be defined at module level and `os` imported;
                # os.path.join avoids the fragile backslash escape of the original "\model%d" literal.
                filename = os.path.join(dirname, "model%d" % agent_step)
                self._trainer.save_checkpoint(filename)
    
    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
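
For reference, the Bellman target produced by compute_q_targets and the Huber criterion above can be written out in plain NumPy. This is an illustrative sketch (function and variable names are not part of the example), not code from the original source.

import numpy as np

def q_targets_numpy(target_q_post, rewards, terminals, gamma=0.95):
    # Bellman target: r for terminal transitions, r + gamma * max_a Q_target(s', a) otherwise
    return np.where(terminals, rewards, rewards + gamma * target_q_post.max(axis=1))

def huber_loss_numpy(q_targets, q_acted, delta=1.0):
    # Quadratic near zero, linear beyond |delta|; less sensitive to outliers than squared error
    err = np.abs(q_targets - q_acted)
    return np.where(err <= delta, 0.5 * err ** 2, delta * (err - 0.5 * delta))

# Tiny worked example: batch of 3 transitions, 2 actions, gamma = 0.95
target_q_post = np.array([[1.0, 2.0], [0.5, 0.3], [4.0, 1.0]])
rewards = np.array([0.0, 1.0, -1.0])
terminals = np.array([False, True, False])
print(q_targets_numpy(target_q_post, rewards, terminals))  # [1.9, 1.0, 2.8]
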
Example 2
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4, target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a more stable version
        of the Action Value Network used to increase training stability.

        The Target Network is a frozen copy of the Action Value Network, updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                    filename = "models\model%d" % agent_step
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), self._num_actions_taken)
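
The examples above only define the agent; they are driven by an external control loop that alternates act/observe/train. Below is a minimal sketch of such a loop. The DummyEnv stand-in, its shapes, and the action count are illustrative assumptions added here; in the original setting the observations come from AirSim.

import numpy as np

class DummyEnv(object):
    """Toy stand-in for the real (AirSim) environment, used only for illustration."""
    def __init__(self, frame_shape=(84, 84), nb_actions=6):
        self.frame_shape, self.nb_actions = frame_shape, nb_actions
    def reset(self):
        return np.random.rand(*self.frame_shape).astype(np.float32)
    def step(self, action):
        next_state = np.random.rand(*self.frame_shape).astype(np.float32)
        reward = float(np.random.rand())
        done = np.random.rand() < 0.01
        return next_state, reward, done

env = DummyEnv()
agent = DeepQAgent((4,) + env.frame_shape, env.nb_actions)  # 4 stacked frames

state = env.reset()
for step in range(100000):
    action = agent.act(state)                       # epsilon-greedy action
    next_state, reward, done = env.step(action)     # apply it to the environment
    agent.observe(state, action, reward, done)      # store the transition
    agent.train()                                   # train/update target net when due
    state = env.reset() if done else next_state
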
Example 3
class DQAgent(object):
    """docstring for DQAgent"""                                                             ############should modify @!@!
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma
        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval
        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])

        self._action_value_net.update_signature(Tensor[input_shape])

        self._target_net = self._action_value_net.clone(CloneMethod.freeze)


        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)

            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            return huber_loss(q_targets, q_acted, 1.0)

        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        self._history.append(state)

        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))
            action = q_values.argmax()

        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        self._episode_rewards.append(reward)

        if done:
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            self._history.reset()

        self._memory.append(old_state, action, reward, done)

    def train(self):

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(actions.reshape(-1,1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals
                    )
                )

            if (agent_step % self._target_update_interval) == 0:
                self._target_net = self._action_value_net.clone(CloneMethod.freeze)
                filename = "model\model%d" % agent_step # save ???? not good at using %d
                self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):

        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep', sum(self._episode_rewards), self._num_actions_taken)
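
Every example here constructs a LinearEpsilonAnnealingExplorer(start, end, steps), but its definition is not shown. Below is a minimal sketch of what such an epsilon-greedy schedule typically looks like; it is an assumption for illustration, and the class actually used with these agents may differ in detail (the _rate attribute is included only because Example 6 logs it).

import numpy as np

class LinearEpsilonAnnealingExplorer(object):
    """Epsilon-greedy schedule: epsilon decays linearly from `start` to `end` over `steps` actions."""
    def __init__(self, start, end, steps):
        self._start, self._end, self._steps = start, end, steps
        self._delta = (end - start) / float(steps)
        self._rate = start

    def _epsilon(self, step):
        # Current exploration rate for the given action counter
        return self._end if step >= self._steps else self._start + step * self._delta

    def is_exploring(self, step):
        # Explore with probability epsilon(step); _rate is kept for logging
        self._rate = self._epsilon(step)
        return np.random.rand() < self._rate

    def __call__(self, nb_actions):
        # Uniformly random action when exploring
        return np.random.randint(nb_actions)
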
Example 4
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025,
                 momentum=0.95,
                 minibatch_size=32,
                 memory_size=500000,
                 train_after=200000,
                 train_interval=4,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)

    def load(self, model_path):

        self._trainer.restore_from_checkpoint(model_path)

    def act(self, state, eval=False):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken) and not eval:
            action = self._explorer(self.nb_actions)
            q_values = None
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action, q_values

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self, checkpoint_dir):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a more stable version
        of the Action Value Network used to increase training stability.

        The Target Network is a frozen copy of the Action Value Network, updated at regular intervals.
        """

        agent_step = self._num_actions_taken

        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                #print('training... number of steps: {}'.format(agent_step))

                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)
                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    # Join path components explicitly instead of embedding a backslash in the format string
                    filename = os.path.join(checkpoint_dir, "models",
                                            "model%d" % agent_step)
                    self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning
        """
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)

        self._metrics_writer.write_value('Sum rewards per ep.',
                                         sum(self._episode_rewards),
                                         self._num_actions_taken)

    def get_depth_image(self, client):
        # get depth image from airsim
        responses = client.simGetImages([
            airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective,
                                True, False)
        ])

        img1d = np.array(responses[0].image_data_float, dtype=np.float64)  # np.float is a deprecated alias for float
        img1d = 255 / np.maximum(np.ones(img1d.size), img1d)
        if img1d.size > 1:

            img2d = np.reshape(img1d,
                               (responses[0].height, responses[0].width))

            image = Image.fromarray(img2d)
            im_final = np.array(image.resize((84, 84)).convert('L'))
            im_final = im_final / 255.0

            return im_final

        return np.zeros((84, 84)).astype(float)

    # Gets a coverage image from AirSim
    def get_cov_image(self, coverage_map):

        state, cov_reward = coverage_map.get_state_from_pose()
        #state = self.coverage_map.get_map_scaled()

        # debug only
        #im = Image.fromarray(np.uint8(state))
        #im.save("DistributedRL\\debug\\{}.png".format(time.time()))

        # normalize state
        state = state / 255.0

        return state, cov_reward
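
Another dependency shared by all of these agents is History, the short-term memory that stacks the most recent observations so the network sees a short temporal window. A minimal sketch under that assumption is given below; the real class used with these examples may differ in detail.

import numpy as np

class History(object):
    """Rolling stack of the last `input_shape[0]` observations, e.g. 4 x 84 x 84 frames."""
    def __init__(self, input_shape):
        self._buffer = np.zeros(input_shape, dtype=np.float32)

    @property
    def value(self):
        # Current stacked window, shape == input_shape
        return self._buffer

    def append(self, state):
        # Drop the oldest observation and append the newest one
        self._buffer[:-1] = self._buffer[1:]
        self._buffer[-1] = state

    def reset(self):
        self._buffer.fill(0)
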
Example 5
class LearningAgent(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 gamma=0.99,
                 learning_rate=1e-4,
                 momentum=0.95):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim,
                      activation=None,
                      init=he_uniform(scale=0.01),
                      name='actions')
            ])
            self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions,
                                 axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target q values
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) *
                            online_selection,
                            axis=0)

        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)

        self._learner = adam(self.model.parameters,
                             lr_schedule,
                             m_schedule,
                             variance_momentum=vm_schedule)
        self.writer = TensorBoardProgressWriter(log_dir='metrics',
                                                model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner],
                               self.writer)

    def act(self, state, epsilon):
        """
        Selects an action to take based on the epsilon greedy method
        :param state: The current state
        :param epsilon: Determines the amount of exploration. (1 - full exploration, 0 - no exploration)
        """
        if np.random.rand(1) < epsilon:  # uniform draw in [0, 1): explore with probability epsilon
            # Explore (random action)
            return np.random.choice(self.action_dim)
        else:
            # Exploit (greedy action based on knowledge)
            return self.model.eval(state).argmax()

    def train(self, s, a, r, s_, t, w):
        """
        Updates the network weights using the given minibatch data
        :param s: Tensor[state_dim] Current state
        :param a: Tensor[int] Action taken at state s
        :param r: Tensor[float] Reward received for taking action a at state s
        :param s_: Tensor[state_dim] State resulting from taking action a at state s
        :param t: Tensor[boolean] True if s_ was a terminal state and false otherwise
        :param w: Tensor[float] Importance sampling weights
        """
        a = Value.one_hot(a.tolist(), self.action_dim)
        td_error = self.trainer.train_minibatch(
            {
                self.pre_states: s,
                self.actions: a,
                self.rewards: r,
                self.post_states: s_,
                self.terminals: t,
                self.is_weights: w
            },
            outputs=[self.td_error])
        return td_error[0]

    def update_target(self):
        """
        Update the target network using the online network weights
        """
        self.target_model = self.model.clone(CloneMethod.freeze)

    def checkpoint(self, filename):
        self.trainer.save_checkpoint(filename)

    def save_model(self, filename):
        self.model.save(filename)
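
Example 5 replaces the vanilla DQN target with the Double DQN (DDQN) target: the online network chooses the arg-max action for the next state, and the frozen target network evaluates it. The NumPy sketch below spells out the difference; array names are illustrative only, and terminals is a 0/1 float array as in the loss above.

import numpy as np

def dqn_target(q_target_post, rewards, terminals, gamma=0.99):
    # Vanilla DQN: the target network both selects and evaluates the best next action
    return rewards + gamma * (1.0 - terminals) * q_target_post.max(axis=1)

def ddqn_target(q_online_post, q_target_post, rewards, terminals, gamma=0.99):
    # Double DQN: the online network selects the action, the target network evaluates it,
    # which reduces the over-estimation bias of the max operator
    best = q_online_post.argmax(axis=1)
    chosen = q_target_post[np.arange(len(best)), best]
    return rewards + gamma * (1.0 - terminals) * chosen
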
Example 6
class DeepQAgent(object):
    """
    Implementation of a Deep Q-Network agent, as described in:
        Nature 518, "Human-level control through deep reinforcement learning" (Mnih et al., 2015)
    """
    def __init__(self,
                 input_shape,
                 nb_actions,
                 gamma=0.99,
                 explorer=LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000),
                 fixpolicy=LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000),
                 learning_rate=0.00025,
                 momentum=0.95,
                 minibatch_size=32,
                 memory_size=500000,
                 train_after=10000,
                 train_interval=4,
                 target_update_interval=10000,
                 monitor=True):
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._fixpolicy = fixpolicy
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        print("input_shape:", input_shape)
        print("input_shape[1:]", input_shape[1:])
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                #Convolution2D((8, 8), 16, strides=4),
                #Convolution2D((4, 4), 32, strides=2),
                #Convolution2D((1, 1), 16, strides=1),
                Dense(25, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) +
                rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()],
                   terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions,
                                 axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters,
                     lr_schedule,
                     momentum=m_schedule,
                     variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)
        #self._trainer.restore_from_checkpoint("models_heuristic_no_image\model")
    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)
        #if True:
        if self._fixpolicy.is_exploring(self._num_actions_taken):
            diff_x = state[3] - state[0]
            diff_y = state[4] - state[1]
            diff_z = state[5] - state[2]
            diff_arr = np.array([diff_x, diff_y, diff_z])
            direction = np.argmax(np.absolute(diff_arr))
            # Equivalent pure-Python alternative, kept for reference but disabled:
            #   abs_diff = [math.fabs(diff_x), math.fabs(diff_y), math.fabs(diff_z)]
            #   direction = abs_diff.index(max(abs_diff))
            print(diff_arr)
            if diff_arr[direction] < 0:
                fixaction = direction + 4
            else:
                fixaction = direction + 1

            self._num_actions_taken += 1
            return fixaction

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, a more stable version
        of the Action Value Network used to increase training stability.

        The Target Network is a frozen copy of the Action Value Network, updated at regular intervals.
        """

        agent_step = self._num_actions_taken
        print("agent_step = ", agent_step)
        #time.sleep(1)
        if agent_step >= self._train_after:
            if (agent_step % self._train_interval) == 0:
                pre_states, actions, post_states, rewards, terminals = self._memory.minibatch(
                    self._minibatch_size)

                self._trainer.train_minibatch(
                    self._trainer.loss_function.argument_map(
                        pre_states=pre_states,
                        actions=Value.one_hot(
                            actions.reshape(-1, 1).tolist(), self.nb_actions),
                        post_states=post_states,
                        rewards=rewards,
                        terminals=terminals))

                # Update the Target Network if needed
                if (agent_step % self._target_update_interval) == 0:
                    self._target_net = self._action_value_net.clone(
                        CloneMethod.freeze)
                    filename = "models_heuristic_no_image_less_exploration\model%d" % agent_step
                    print(
                        "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$filename=",
                        filename)
                    self._trainer.save_checkpoint(filename)
                    #time.sleep(100)

    def _plot_metrics(self):
        """Plot current buffers' accumulated values to visualize agent learning,
        and append them to a plain-text log file.
        """
        global landing_count, episode_count
        f = open('log__heuristic_no_image_less_exploration2', 'a+')
        f.write('episode:' + str(episode_count) + ': exploration rate= ' +
                str(self._explorer._rate) + ' heuristic fix rate= ' +
                str(self._fixpolicy._rate) + '\n')
        if len(self._episode_q_means) > 0:
            mean_q = np.asscalar(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q,
                                             self._num_actions_taken)
            print('Mean Q per ep.', mean_q, self._num_actions_taken)
            f.write('Mean Q per ep. ' + str(mean_q) + ' ' +
                    str(self._num_actions_taken) + '\n')

        if len(self._episode_q_stddev) > 0:
            std_q = np.asscalar(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                             self._num_actions_taken)
            print('Mean Std Q per ep.', std_q, self._num_actions_taken)
            f.write('Mean Std Q per ep. ' + str(std_q) + ' ' +
                    str(self._num_actions_taken) + '\n')

        self._metrics_writer.write_value('Sum rewards per ep.',
                                         sum(self._episode_rewards),
                                         self._num_actions_taken)
        print('Sum rewards per ep.', sum(self._episode_rewards),
              self._num_actions_taken)
        f.write('Sum rewards per ep. ' + str(sum(self._episode_rewards)) +
                ' ' + str(self._num_actions_taken) + '\n')
        if landing_count > 0:
            f.write('****************Success landing**********' +
                    str(landing_count) + '\n')
        landing_count = 0
        episode_count = 0

        f.write('\n')
        f.close()
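
The heuristic "fix policy" branch in act() above moves along the coordinate axis with the largest absolute position error. Assuming the state layout [x, y, z, goal_x, goal_y, goal_z, ...] implied by the indexing, that branch reduces to the small helper below; the helper name and the meaning of action 0 are assumptions added for illustration.

import numpy as np

def heuristic_action(state):
    # Positional error along x, y, z
    diff = np.asarray(state[3:6], dtype=float) - np.asarray(state[0:3], dtype=float)
    axis = int(np.argmax(np.abs(diff)))
    # Actions 1-3 presumably move in the positive x/y/z direction, 4-6 in the negative one
    return axis + 4 if diff[axis] < 0 else axis + 1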