Example #1
class DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space

        # Constrain state and action spaces
        self.state_size = 1  # position only
        self.state_range = self.task.observation_space.high[
            2] - self.task.observation_space.low[2]
        self.action_size = 1  # force only
        self.action_range = self.task.action_space.high[
            2] - self.task.action_space.low[2]

        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # Actor (Policy) Model
        self.action_low = self.task.action_space.low[2]
        self.action_high = self.task.action_space.high[2]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)
        #print('Noise generated')

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)
        print('Replay Buffer initialized')

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Episode variables
        self.reset_episode_vars()

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.episode_num = 1

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def step(self, state, reward, done):
        # Reduce state vector
        state = self.preprocess_state(state)

        # Transform state vector
        state = (state - self.task.observation_space.low[2]
                 ) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector
        #print('Transform state vector')
        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state,
                            done)
            self.total_reward += reward
            self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            # Learn from saved experiences
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        # At the end of each episode, write episode stats and reset episode variables
        if done:
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        #print('Action taken!')
        return actions + self.noise.sample()  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        #print('State preprocessed')
        return state[2]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2] = action  # linear force only
        #print('State postprocessed')
        return complete_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=['episode', 'total_reward'
                                                  ])  # single-row dataframe
        df_stats.to_csv(
            self.stats_filename,
            mode='a',
            index=False,
            header=not os.path.isfile(
                self.stats_filename))  # write header first time only
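The agent above (and most of the examples that follow) builds its exploration noise with OUNoise(self.action_size) and adds self.noise.sample() to the actor's output, but the class itself is not shown. Below is a minimal sketch of an Ornstein-Uhlenbeck noise helper with that interface; the mu, theta and sigma defaults are illustrative assumptions, and the project's actual implementation may differ.

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch of the noise helper used by the agents above)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.3):
        self.mu = mu * np.ones(size)   # long-running mean
        self.theta = theta             # speed of mean reversion
        self.sigma = sigma             # volatility of the random kicks
        self.size = size
        self.reset()

    def reset(self):
        """Reset the internal noise state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new noise value."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.size)
        self.state = self.state + dx
        return self.state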
Example #2
class Task01_DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):

        # Task (environment) information
        self.task = task  # should contain observation_space and action_space
        #self.state_size = np.prod(self.task.observation_space.shape)
        # self.task.observation_space.high = self.task.observation_space.high[2:3]
        # self.task.observation_space.low =  self.task.observation_space.low[2:3]
        self.state_range = self.task.observation_space.high - self.task.observation_space.low
        #self.action_size = np.prod(self.task.action_space.shape)
        self.action_range = self.task.action_space.high - self.task.action_space.low
        self.task.observation_space.high = self.task.observation_space.high[
            2:3]
        self.task.observation_space.low = self.task.observation_space.low[2:3]

        #self.state_range = self.state_range[2:3]
        #self.action_range = self.action_range[2:3]

        # Constrain state and action spaces
        self.state_size = 1  # position only
        self.action_size = 1  # force only
        self.action_low = self.task.action_space.low[2:3]
        self.action_high = self.task.action_space.high[2:3]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # # Policy parameters
        # self.w = np.random.normal(
        #     size=(self.state_size, self.action_size),  # weights for simple linear policy: state_space x action_space
        #     scale=(self.action_range / (2 * self.state_size)).reshape(1, -1))  # start producing actions in a decent range

        # Score tracker and learning parameters
        self.best_w = None
        self.best_score = -np.inf
        self.noise_scale = 0.1

        # Actor (Policy) Model
        #self.action_low = self.task.action_space.low
        #self.action_high = self.task.action_space.high
        self.state_range = self.state_range[2:3]
        self.action_range = self.action_range[2:3]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Episode variables
        #self.reset_episode_vars()

        #---------------------------------------
        # Saving data

        self.stats_filename = os.path.join(
            util.get_param('out') + '/task01/',
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode',
                              'total_reward']  # specify columns to save
        self.episode_num = 1
        print("Saving stats {} to {}".format(self.stats_columns,
                                             self.stats_filename))

        # Load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 1  # save weights every n episodes, None to disable
        self.model_dir = util.get_param(
            'out'
        ) + '/task01'  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "my-model_" + util.get_timestamp()
        self.model_ext = ".h5"
        # if self.load_weights or self.save_weights_every:
        #     self.actor_filename_local = os.path.join(self.model_dir,
        #         "{}_actor_local{}".format(self.model_name, self.model_ext))
        #     self.critic_filename_local = os.path.join(self.model_dir,
        #         "{}_critic_local{}".format(self.model_name, self.model_ext))
        #     self.actor_filename_target = os.path.join(self.model_dir,
        #         "{}_actor_target{}".format(self.model_name, self.model_ext))
        #     self.critic_filename_target = os.path.join(self.model_dir,
        #         "{}_critic_target{}".format(self.model_name, self.model_ext))
        #     print("Actor local filename :", self.actor_filename_local)  # [debug]
        #     print("Critic local filename:", self.critic_filename_local)  # [debug]
        #     print("Actor target filename :", self.actor_filename_target)  # [debug]
        #     print("Critic target filename:", self.critic_filename_target)  # [debug]

        # Load pre-trained model weights, if available
        #if self.load_weights and os.path.isfile(self.actor_filename_local):
        if self.load_weights:
            try:

                date_of_file = '2018-02-20_11-28-13'
                #date_of_file = '2018-02-20_11-22-27'
                self.actor_filename_local = os.path.join(
                    self.model_dir,
                    'my-model_{}_actor_local.h5'.format(date_of_file))
                self.critic_filename_local = os.path.join(
                    self.model_dir,
                    'my-model_{}_critic_local.h5'.format(date_of_file))
                self.actor_filename_target = os.path.join(
                    self.model_dir,
                    'my-model_{}_actor_target.h5'.format(date_of_file))
                self.critic_filename_target = os.path.join(
                    self.model_dir,
                    'my-model_{}_critic_target.h5'.format(date_of_file))

                self.actor_local.model.load_weights(self.actor_filename_local)
                self.critic_local.model.load_weights(
                    self.critic_filename_local)
                self.actor_target.model.load_weights(
                    self.actor_filename_target)
                self.critic_target.model.load_weights(
                    self.critic_filename_target)
                print("Model weights loaded from file: {}, {}, {}, {}".format(
                    self.actor_filename_local, self.critic_filename_local,
                    self.actor_filename_target,
                    self.critic_filename_target))  # [debug]
            except Exception as e:
                print("Unable to load model weights from file: {}, {}, {}, {}".
                      format(self.actor_filename_local,
                             self.critic_filename_local,
                             self.actor_filename_target,
                             self.critic_filename_target))
                print("{}: {}".format(e.__class__.__name__, str(e)))

        # Set the name of the weight files to this current time stamp, even if loaded from another timestamp.
        self.actor_filename_local = os.path.join(
            self.model_dir, "{}_actor_local{}".format(self.model_name,
                                                      self.model_ext))
        self.critic_filename_local = os.path.join(
            self.model_dir, "{}_critic_local{}".format(self.model_name,
                                                       self.model_ext))
        self.actor_filename_target = os.path.join(
            self.model_dir, "{}_actor_target{}".format(self.model_name,
                                                       self.model_ext))
        self.critic_filename_target = os.path.join(
            self.model_dir,
            "{}_critic_target{}".format(self.model_name, self.model_ext))

        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        # Episode variables
        self.episode = 0
        self.reset_episode_vars()

        #---------------------------------------

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame(
            [stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(
            self.stats_filename,
            mode='a',
            index=False,
            header=not os.path.isfile(
                self.stats_filename))  # write header first time only

    def step(self, state, reward, done):

        # Reduce state vector
        state = self.preprocess_state(state)

        # Transform state vector
        state = (state - self.task.observation_space.low
                 ) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state,
                            done)

            self.total_reward += reward
            self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        #----------------------

        # Learn, if at end of episode
        if done:

            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1

            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename_local)
                self.critic_local.model.save_weights(
                    self.critic_filename_local)
                self.actor_target.model.save_weights(
                    self.actor_filename_target)
                self.critic_target.model.save_weights(
                    self.critic_filename_target)
                print(
                    "Model weights saved at episode {}. Model files: {}. {}, {}, {}"
                    .format(self.episode, self.actor_filename_local,
                            self.critic_filename_local,
                            self.actor_filename_target,
                            self.critic_filename_target))  # [debug]

            # Learn once more at the end of the episode, if enough samples are available
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample(self.batch_size)
                self.learn(experiences)

            self.episode += 1
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        #return action

        # Return complete action vector
        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[2:3]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2:3] = action  # linear force only
        return complete_action
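The agents above store transitions with self.memory.add(...) and sample batches whose elements expose .state, .action, .reward, .next_state and .done, but the ReplayBuffer class is not included. Here is a minimal sketch matching that interface, assuming a deque-backed buffer of namedtuples; the project's real buffer may differ (a later example uses add_experience/len() and .state_next instead).

import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch of the interface used above)."""

    def __init__(self, buffer_size=100000):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped automatically

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=64):
        """Return a random batch of stored experiences."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of stored experiences, so that len(self.memory) works as above."""
        return len(self.memory)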
Example #3
class DDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space

        # Load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 5  # save weights every n episodes, None to disable
        self.model_dir = util.get_param(
            'out'
        )  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "ddpg_takeoff"
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name,
                                                    self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name,
                                                     self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        # Constrain state and action spaces
        self.state_size = 1  # position only
        self.state_range = self.task.observation_space.high[
            2] - self.task.observation_space.low[2]
        self.action_size = 1  # force only
        self.action_range = self.task.action_space.high[
            2] - self.task.action_space.low[2]

        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        # Actor (Policy) Model
        self.action_low = self.task.action_space.low[2]
        self.action_high = self.task.action_space.high[2]
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Load pre-trained model weights, if available
        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))

        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)
        print('Replay Buffer initialized')

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Episode variables
        self.episode_num = 0
        self.reset_episode_vars()
        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'), "ddpg_takeoff_stats_{}.csv".format(
                util.get_timestamp()))  # path to CSV file

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num += 1

    def step(self, state, reward, done):
        # Reduce state vector
        state = self.preprocess_state(state)

        # Transform state vector
        state = (state - self.task.observation_space.low[2]
                 ) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state,
                            done)
            self.total_reward += reward
            self.count += 1

        # Learn, if enough samples are available in memory
        #print('length memory: {}, batch size: {}'.format(len(self.memory),self.batch_size))
        if len(self.memory) > self.batch_size:
            # Learn from saved experiences
            experiences = self.memory.sample(self.batch_size)
            #print('learning')
            self.learn(experiences)
            #print('learned')

        if done:
            # Save model weights at regular intervals
            if self.save_weights_every and self.episode_num % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode",
                      self.episode_num)  # [debug]
            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward])
            print('Amount of steps in this episode:', self.count)
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[2]  # z-position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2] = action  # linear force only
        return complete_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=['episode', 'total_reward'
                                                  ])  # single-row dataframe
        df_stats.to_csv(
            self.stats_filename,
            mode='a',
            index=False,
            header=not os.path.isfile(
                self.stats_filename))  # write header first time only
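Each agent exposes a single step(state, reward, done) entry point and returns a full-sized action vector; none of the examples show how that method is driven. The sketch below is a hypothetical Gym-style episode loop meant only to illustrate the calling convention; in the original project the agent is wired into a simulator callback instead, so run_episode and env here are assumptions.

def run_episode(env, agent):
    """Hypothetical Gym-style driver loop; the real project calls agent.step() from a
    simulator callback, so env, reset() and step() here are assumptions."""
    state = env.reset()
    reward, done = 0.0, False
    total_reward = 0.0
    while not done:
        # The agent records (state, reward, done) internally and returns the next action.
        action = agent.step(state, reward, done)
        state, reward, done, _ = env.step(action)
        total_reward += reward
    # Pass the terminal transition in as well so the agent can log stats and reset itself.
    agent.step(state, reward, done)
    return total_reward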
Example #4
class DDPG(BaseAgent):
    def __init__(self, task):
        print('start DDPG')
        self.task = task
        self.state_size = 1
        self.action_size = 1
        self.space_low = self.task.observation_space.low[2:3]
        self.state_range = self.task.observation_space.high[2:3] - self.space_low  # used to scale the state

        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save

        # Episode variables
        self.reset_episode_vars()
        self.actor_learning_rate = 0.0001
        self.tau = 0.99
        self.mini_batch_size = 64
        self.buffer_size = 100000
        self.critic_learning_rate = 0.001
        self.gamma = 0.88
        self.episode = 0

        # Load/save parameters
        self.load_weights = False  # try to load weights from previously saved models
        self.save_weights_every = 50  # save weights every n episodes, None to disable
        self.model_dir = util.get_param(
            'out')  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "my-model4"  #my-model3
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(self.model_dir,
                                               "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(self.model_dir,
                                                "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        self.memory = ReplayBuffer(self.buffer_size)

        self.action_low = self.task.action_space.low[2:3]
        self.action_high = self.task.action_space.high[2:3]
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))

        if self.save_weights_every:
            print("Saving model weights", "every {} episodes".format(
                self.save_weights_every) if self.save_weights_every else "disabled")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        self.actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def postprocess_action(self, action):
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[2:3] = action  # linear force only
        return complete_action

    def step(self, state, reward, done):
        # Transform state vector
        old_height = state[2:3]
        state = (old_height - self.space_low) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        if len(self.memory) > self.mini_batch_size:
            self.learn(self.memory.sample(self.mini_batch_size))

        if done:
            print('reward', self.total_reward, "height", old_height)
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                #print("Model weights saved at episode", self.episode)

            self.write_stats([self.episode, self.total_reward])
            self.episode += 1
            self.reset_episode_vars()

        final_action = self.actor_target.model.predict_on_batch(state)
        self.last_state = state
        self.last_action = final_action

        return self.postprocess_action(final_action)

    def act(self, states):
        actions = self.actor_local.model.predict(states)
        return actions + self.actor_noise.sample()

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,
                                                                                                        self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1,
                                                                                                     self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
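All of the examples rely on util.get_param('out') for an output directory and util.get_timestamp() for file-name timestamps such as '2018-02-20_11-28-13' above, but the util module is not shown. Below is a hypothetical stand-in with that behaviour, assuming 'out' simply names a writable output directory; the project's real util module presumably reads its parameters from a config file.

import os
from datetime import datetime

# Hypothetical stand-in for the project's util module used throughout the examples.

def get_param(name):
    """Return a configuration parameter; only the 'out' directory is sketched here."""
    if name == 'out':
        out_dir = os.path.join(os.getcwd(), 'out')
        os.makedirs(out_dir, exist_ok=True)
        return out_dir
    raise KeyError("Unknown parameter: {}".format(name))

def get_timestamp():
    """Timestamp string matching the file names above, e.g. '2018-02-20_11-28-13'."""
    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')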
Example #5
class DDPG(BaseAgent):
    def __init__(self, task):
        self.task = task

        self.state_size = 3  # position only
        self.action_size = 3  #  force only
        self.action_low = self.task.action_space.low[0:3]
        self.action_high = self.task.action_space.high[0:3]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        #load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 100  # None to disable
        self.model_dir = util.get_param('out')

        self.model_name = "my-model"
        self.model_ext = ".h5"
        self.episode = 0
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(
                self.model_dir, "{}_actor{}".format(self.model_name,
                                                    self.model_ext))
            self.critic_filename = os.path.join(
                self.model_dir, "{}_critic{}".format(self.model_name,
                                                     self.model_ext))
            print("Actor filename:", self.actor_filename)
            print("Critic filename:", self.critic_filename)

        # Actor(Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic(Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Load pre-trained model weights, if available
        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))

        if self.save_weights_every:
            print("Saving model weights",
                  "every {} episodes".format(self.save_weights_every)
                  if self.save_weights_every else "disabled")  # [debug]

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  #discount factor
        self.tau = 0.001  # for soft update of target parameters

        self.rewards_list = []
        self.reset_episode_vars()

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'), "stats_{}.csv".format(util.get_timestamp()))
        self.stats_columns = ['episode',
                              'total_reward']  # specify columns to save
        self.episode_num = 1
        print("Saving stats {} to {}".format(self.stats_columns,
                                             self.stats_filename))  # debug
        #print("init complete") #[debug]

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode += 1

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions"""
        return state[0:3]  # position only

    def postprocess_action(self, action):
        """Return complete action vector"""
        complete_action = np.zeros(self.task.action_space.shape)  # shape (6,)
        complete_action[0:3] = action  # linear force only
        return complete_action

    def step(self, state, reward, done):
        #print("take a step") #[debug]

        # Reduce state vector
        state = self.preprocess_state(state)

        # Choose an action (get action through local actor network)
        action = self.act(state)

        # Save experience/reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state,
                            done)
            self.total_reward += reward
            self.count += 1

        # Learn, if replay buffer is ample to sample experiences (online learning)
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        if done:
            #print("Done") #[debug]
            self.write_stats([self.episode_num, self.total_reward])
            self.episode_num += 1

            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode",
                      self.episode)  # [debug]
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        #print("end of step") #[debug]
        return self.postprocess_action(action)

    # to save rewards stats
    def write_stats(self, stats):
        """Write single episode stats to CSV file"""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)
        df_stats.to_csv(self.stats_filename,
                        mode='a',
                        index=False,
                        header=not os.path.isfile(self.stats_filename))
        print(stats)  # debug

    def learn(self, experiences):
        #print("start learn") #[debug]
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from targets models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model(local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        # learning_phase() = 0 -> test mode
        # learning_phase() = 1 -> train mode
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.actor_local, self.actor_target)
        self.soft_update(self.critic_local, self.critic_target)

    def act(self, states):
        #print("act") #[debug]
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.model.get_weights())
        target_weights = np.array(target_model.model.get_weights())

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.model.set_weights(new_weights)
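The learn() methods above depend on two pieces of Keras plumbing that none of the examples define: critic_local.get_action_gradients([states, actions, 0]), which returns the gradient of the predicted Q-value with respect to the action input (learning phase 0 = test mode), and actor_local.train_fn([states, action_gradients, 1]), a custom training function that performs gradient ascent on Q (learning phase 1 = train mode), as the comments in Example #5 hint. The sketch below shows one way such functions are commonly built with the Keras 2 backend; the helper names and learning rate are assumptions rather than the project's actual Actor/Critic classes.

from keras import optimizers
from keras import backend as K

def build_critic_action_gradients(critic_model):
    """Sketch: K.function returning dQ/da for a critic whose inputs are [states, actions]."""
    # critic_model.input is assumed to be the list [state_input, action_input]
    action_gradients = K.gradients(critic_model.output, critic_model.input[1])
    return K.function(
        inputs=[*critic_model.input, K.learning_phase()],
        outputs=action_gradients)

def build_actor_train_fn(actor_model, learning_rate=1e-4):
    """Sketch: K.function that nudges the actor towards actions with higher Q-values."""
    action_gradients = K.placeholder(shape=(None, actor_model.output_shape[-1]))
    # Gradient ascent on Q is implemented as gradient descent on -Q (averaged over the batch).
    loss = K.mean(-action_gradients * actor_model.output)
    optimizer = optimizers.Adam(lr=learning_rate)
    updates = optimizer.get_updates(params=actor_model.trainable_weights, loss=loss)
    return K.function(
        inputs=[actor_model.input, action_gradients, K.learning_phase()],
        outputs=[],
        updates=updates)

With helpers like these, get_action_gradients and train_fn would simply be attributes assigned inside the Critic and Actor constructors.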
Example #6
class DDPG(BaseAgent):
    """Sample agent that searches for optimal policy randomly."""

    def setup_weights(self):
        # save weights
        self.load_weights = True
        self.save_weights_every = 50
        self.model_dir = util.get_param('out')
        self.model_name = "ddpg"
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(self.model_dir,
                    "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(self.model_dir,
                    "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)
            print("Critic filename:", self.critic_filename)
        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))
        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    def __init__(self, task):
        self.task = task
        self.state_size = 3
        self.action_size = 3

        #set action space limits
        self.action_low = self.task.action_space.low[0:3]
        self.action_high = self.task.action_space.high[0:3]
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))

        action = [self.action_size, self.action_low, self.action_high]

        #Initialize network
        #Actor
        self.actor_local = Actor(self.state_size, action)
        self.actor_target = Actor(self.state_size, action)
        #Critic
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.setup_weights()

        #noise
        self.noise = OUNoise(self.action_size)

        #Replay buffer
        self.buffer_size = 100000
        self.batch_size = 128 
        self.memory = ReplayBuffer(self.buffer_size)

        #Hyper params
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # log file
        self.stats = os.path.join(util.get_param('out'), "stats_{}.csv".format(
          util.get_timestamp()))
        self.episode_no = 1
        self.stats_columns = ['episodes', 'total_reward']
        print("Saving stats {} to {}".format(self.stats_columns, self.stats))


        # Episode variables
        self.reset_episode_vars()

    def preprocess_state(self, state):
        return state[0:3]

    def postprocess_action(self, action):
        constrained_action = np.zeros(self.task.action_space.shape)
        constrained_action[0:3] = action
        return constrained_action

    def write(self, data):
        df_stats = pd.DataFrame([data], columns=self.stats_columns)
        df_stats.to_csv(self.stats, mode='a', index=False,
         header=not os.path.isfile(self.stats))

    def reset_episode_vars(self):
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0

    def step(self, state, reward, done):

        state = self.preprocess_state(state)
        #choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.total_reward += reward
            self.count += 1
            self.memory.add_experience(self.last_state, self.last_action, reward, state, done)

        # Learn, if enough samples are available in memory
        if self.memory.len() > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        if done:
            if self.save_weights_every and self.episode_no % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode", self.episode_no)
            self.write([self.episode_no, self.total_reward])
            self.episode_no += 1
            self.reset_episode_vars()

        self.last_state = state
        self.last_action = action
        return self.postprocess_action(action)

    def act(self, state):
        # Choose action based on given state and policy
        states = np.reshape(state, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)

        return actions + self.noise.sample()

    def learn(self, experiences):
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
                np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
                np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
                np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.state_next for e in experiences if e is not None])

        # Get predicted next states and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
                [next_states, actions_next])

        #compute Q targets 
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        #train actor model
        action_gradients = np.reshape(self.critic_local.get_action_gradients(
            [states, actions, 0]),(-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        #update
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)


    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.model.get_weights())
        target_weights = np.array(target_model.model.get_weights())

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.model.set_weights(new_weights)
Example #7
class BaseAgent:
    """Advanced base agent that lets you limit the action and state space"""
    def __init__(self, task, action_min, action_max, state_min, state_max):
        # Task (environment) information
        self.task = task  # should contain observation_space and action_space
        
        self.min_action = action_min # define minimum and maximum action
        self.max_action = action_max
        
        self.min_stat = state_min # define minimum and maximum state
        self.max_stat = state_max
        
        self.learn_when_done = False # defines if the agent shall only learn at the end of each episode

        # Constrain state and action spaces
        self.state_size = self.max_stat - self.min_stat + 1  # number of state dimensions kept
        self.action_size = self.max_action - self.min_action + 1  # number of action dimensions kept
        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))
        
        # calc state space minimum and range
        self.state_low = self.task.observation_space.low[self.min_stat:self.max_stat+1]
        self.state_range = self.task.observation_space.high[self.min_stat:self.max_stat+1] - self.state_low
        # self.action_size = np.prod(self.task.action_space.shape)
        
        # calc action space minimum, maximum and range
        self.action_low = self.task.action_space.low[self.min_action:self.max_action+1]
        self.action_high = self.task.action_space.high[self.min_action:self.max_action+1]
        self.action_range = self.action_high-self.action_low

        # Replay memory
        self.epsilon = 0.0
        self.batch_size = 64
        self.buffer_size = 100000
        self.memory = ReplayBuffer(self.buffer_size)

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward', 'learning_rate']  # specify columns to save
        self.episode_num = 1
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]        

        # Episode variables
        self.reset_episode_vars()
        
    def reset_episode_vars(self):
        """Reset current episode's stats"""
        self.last_state = None
        self.org_last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        
    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[self.min_stat:self.max_stat+1]  # limit to desired state range

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[self.min_action:self.max_action+1] = action  # extend to original size again
        return complete_action        
    
    def handle_step_index(self, done):
        """Is called once each turn for periodic events"""
        pass

    def step(self, state, reward, done):
        """Handles a single step:
           - Convert input state to simpler one
           - Estimate best action
           - Learn all x rounds
           - Write stats to log
           - Convert internal to external action and return it"""
        
        org_state = state
                
        # print("Shape: {}".format(state.shape))
        # Transform state vector
        state = self.preprocess_state(state)

        # print("PP Shape: {}".format(state.shape))

        # print("{} {} {}".format(state.shape, self.state_low.shape, self.state_range.shape))
        state = (state - self.state_low) / self.state_range  # scale to [0.0, 1.0]
        state = state.reshape(1, -1)  # convert to row vector
        
        # Choose an action
        action = self.act(state)
        
        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            # print("Action shape {}".format(self.last_action.shape))
            
            if len(self.memory) == self.batch_size - 1:
                print("Buffer filled, starting learning")
            self.memory.add(self.last_state, self.last_action, reward, state, done)
        
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size and (not self.learn_when_done or done):
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)
        
        # Sum rewards
        if self.last_state is not None and self.last_action is not None:
            self.total_reward += reward
            self.count += 1
            
        # convert action from restricted to full space again
        pp_action = self.postprocess_action(action)

        # Learn, if at end of episode
        if done:
            # Write episode stats
            self.write_stats([self.episode_num, self.total_reward, self.epsilon])
            print("Reward: {} Exploration rate: {}".format(self.total_reward, self.epsilon))
            self.episode_num += 1
            self.reset_episode_vars()
            
        # remember this round's data
        self.last_state = state
        self.org_last_state = org_state
        self.last_action = action                
        
        # notify high level handler
        self.handle_step_index(done)
        
        return pp_action
        
    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
            header=not os.path.isfile(self.stats_filename))  # write header first time only                

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        pass

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        pass
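
BaseAgent deliberately leaves act() and learn() as no-ops; concrete agents are meant to subclass it and fill them in. A minimal, hypothetical random-policy subclass, only to illustrate the intended interface (it assumes the BaseAgent above and numpy as np are in scope):

class RandomAgent(BaseAgent):
    """Hypothetical subclass: uniform random actions within the constrained range, no learning."""
    def act(self, states):
        # action_low/action_range come from BaseAgent and have shape (action_size,)
        return self.action_low + np.random.rand(self.action_size) * self.action_range

    def learn(self, experiences):
        # a real agent would update its policy/value models from the sampled batch here
        pass
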
Example #8
0
class DDPG(BaseAgent):
    '''Agent that searches for optimal policy using Deep Deterministic Policy Gradients.'''

    def __init__(self, task):
        '''
        Initializes variables

        :param task: Should be able to access the following (OpenAI Gym spaces):
            task.observation_space  # i.e. state space
            task.action_space
        '''
        super(DDPG, self).__init__(task)

        self.use_gpu = torch.cuda.is_available()
        self.task = task

        # Hyperparameters
        self.gamma = 0.99 # discount factor
        self.tau = 0.001 # for soft update of target parameters

        # constrained states
        self.state_size = np.prod(self.task.observation_space.shape).item()

        # constrained actions
        self.action_size = 1
        self.action_low = self.task.action_space.low[2:3]
        self.action_high = self.task.action_space.high[2:3]

        # Actor model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.use_gpu)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.use_gpu)
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(), 1e-4)

        # Critic model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(), 1e-3)

        # load saved models (if any) and copy local weights into the target models
        self.best_model_loaded = self.load_models(self.actor_local, self.critic_local)
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        print('Best model loaded: {}'.format(self.best_model_loaded))

        # use GPU?
        if self.use_gpu:
            self.actor_local.cuda()
            self.actor_target.cuda()
            self.critic_local.cuda()
            self.critic_target.cuda()

        # Ornstein-Uhlenbeck noise for action sampling
        self.noise = OrnsteinUhlenbeckProcess(
            size=self.action_size, theta=0.15, sigma=0.02)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size)

        # Score tracker and learning parameters
        self.best_score = -np.inf

        # Episode variables
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num = 1
        self.acts = np.zeros(shape=self.task.action_space.shape) # full action vector returned by step();
                                                                 # all components stay 0 except the
                                                                 # vertical-force one, which the agent learns

    def reset_episode_vars(self):
        '''Resets episode variables'''
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0
        self.count = 0
        self.episode_num += 1
        self.acts = np.zeros(shape=self.task.action_space.shape)

    def step(self, state, reward, done):
        '''Process state, reward, done flag, and return an action.

        :param state: current state vector as Numpy array, compatible with task's state space
        :param reward: last reward received
        :param done: whether this episode is complete

        :return: desired action vector as NumPy array, compatible with task's action space
        '''
        # Choose an action
        state = state[0:self.state_size]
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward
            self.count += 1

        # Learn, if we have enough samples
        if len(self.memory) > self.batch_size and not self.best_model_loaded:
            experience = self.memory.sample(self.batch_size)
            self.learn(experience)

        # Write stats and save the model when the episode ends
        if done:
            score = self.total_reward / float(self.count) if self.count else 0.0
            if score > self.best_score:
                self.best_score = score
                self.save_models(self.episode_num, self.actor_target, self.critic_target, True)

            print("DDPG.learn(): t = {:4d}, score = {:7.3f} (best = {:7.3f}), total reward = {:7.3f}, episode = {}".format(
                  self.count, score, self.best_score, self.total_reward, self.episode_num))
            
            if self.episode_num % 10 == 0:
                self.save_models(self.episode_num, self.actor_target, self.critic_target, False)

            self.write_episode_stats(self.episode_num, self.total_reward)
            self.reset_episode_vars()

        # remember this step's state and action
        self.last_state = state
        self.last_action = action
        self.acts[2] = action # only the vertical-force component is controlled

        return self.acts

    def act(self, state):
        '''
        Predict actions for a state

        :param state: Numpy array, environment state
        :return: Numpy array, predicted actions
        '''
        state = self.to_var(torch.from_numpy(state).float())

        self.actor_local.eval()  # inference mode while selecting an action
        action = self.actor_local.forward(state).detach()
        self.actor_local.train()  # restore training mode so later learn() updates behave as intended
        return action.data.cpu().numpy() + self.noise.sample()

    def to_var(self, x_numpy):
        '''
        Helper to convert Numpy array to PyTorch tensor

        :param x_numpy: Numpy array to convert
        :return: PyTorch tensor
        '''
        x_var = Variable(x_numpy)
        if self.use_gpu:
            x_var = x_var.cuda()
        return x_var

    def learn(self, experiences):
        '''
        Trains the networks

        :param experiences: tuple of the experience - (states, actions, rewards, next_states, dones)
        '''
        # -------------------- get data from batch --------------------
        # get experiences from the replay buffer
        states = np.vstack(experiences[0])
        states = self.to_var(torch.from_numpy(states).float())

        actions = np.vstack(experiences[1])
        actions = self.to_var(torch.from_numpy(actions).float())

        rewards = np.float32(experiences[2])
        rewards = self.to_var(torch.from_numpy(rewards))
        rewards = torch.unsqueeze(rewards, 1)

        next_states = np.vstack(experiences[3])
        next_states = self.to_var(torch.from_numpy(next_states).float())

        dones = np.float32(experiences[4])
        not_dones = self.to_var(torch.from_numpy(1 - dones))
        not_dones = torch.unsqueeze(not_dones, 1)

        # ---------------------- optimize critic ----------------------
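        # Bellman target: y = r + gamma * (1 - done) * Q_target(s', mu_target(s'));
        # detach() keeps gradients from flowing into the target networks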
        next_actions = self.actor_target.forward(next_states).detach()
        Q_targets_next = self.critic_target.forward(next_states, next_actions).detach()

        Q_targets_next = not_dones * Q_targets_next
        Q_targets = rewards + (self.gamma * Q_targets_next)

        Q_predicted = self.critic_local.forward(states, actions)

        # compute critic model loss and train it
        value_loss = nn.SmoothL1Loss()(Q_predicted, Q_targets)

        self.critic_local.zero_grad()
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()

        # ---------------------- optimize actor -----------------------
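        # Deterministic policy gradient: improve the actor by maximizing Q(s, mu(s)),
        # i.e. minimizing the negative mean critic value of the actor's own actions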
        predicted_actions = self.actor_local.forward(states)
        policy_loss = torch.mean(-self.critic_local.forward(states, predicted_actions))

        self.actor_local.zero_grad()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # soft update of target models
        self.soft_update(self.actor_target, self.actor_local)
        self.soft_update(self.critic_target, self.critic_local)

    def hard_update(self, target_model, local_model):
        '''
        Hard update of the target model weights - just copy them from the local model

        :param target_model: Destination, target model
        :param local_model: Source, local model
        '''
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, target_model, local_model):
        '''
        Soft update of the target model weights corresponding to DDPG algorithm

        :param target_model: Destination, target model
        :param local_model: Source, local model
        '''
        for target_param, param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
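
The step(state, reward, done) interface above assumes an outer driver loop that feeds the agent one observation per time step and passes the returned action back to the environment. A hedged sketch of such a loop, assuming a Gym-style task with reset()/step() (the function and variable names here are placeholders, not part of the example):

def run_episodes(agent, task, num_episodes=10):
    # The agent itself decides when to store experience, learn, and log stats
    for _ in range(num_episodes):
        state = task.reset()
        reward, done = 0.0, False
        while not done:
            action = agent.step(state, reward, done)
            state, reward, done, _ = task.step(action)
        agent.step(state, reward, done)  # deliver the terminal transition to the agent
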
Example #9
0
class BaseAgentDDPG(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):

        self.task = task

        # Load/save parameters
        self.load_weights = False  # try to load weights from previously saved models
        self.save_weights_every = None # save weights every n episodes, None to disable
        self.model_dir = util.get_param(
            'out')  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "ddpg-{}".format(self.task.__class__.__name__)
        self.model_ext = ".h5"

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}_{}.csv".format(self.model_name, util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save

        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]

        # Constrain state and action spaces
        self.state_start = 2
        self.state_end = 3
        self.action_start = 2
        self.action_end = 3

        # Noise process
        self.theta = 0.15
        self.sigma = 0.3

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.005  # for soft update of target parameters

        # Episode variables
        self.episode = 0
        self.episode_duration = 0
        self.total_reward = 0
        self.last_state = None
        self.last_action = None
        self.reset_episode_vars()

        # override params in child classes
        self.init_params()

        self.state_size = self.state_end - self.state_start
        self.action_size = self.action_end - self.action_start
        self.action_low = self.task.action_space.low[self.action_start:self.action_end]
        self.action_high = self.task.action_space.high[self.action_start:self.action_end]
        self.noise = OrnsteinUhlenbeckProcess(size=self.action_size, theta=self.theta, sigma=self.sigma)


        # Actor (Policy) Model
        self.actor_learning_rate = 0.0001
        self.actor_local = None
        self.actor_target = None
        self.init_actor_models()

        # Critic (Value) Model
        self.critic_learning_rate = 0.001
        self.critic_local = None
        self.critic_target = None
        self.init_critic_models()

        # Model weight file paths (needed for both loading and saving)
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(self.model_dir,
                                               "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(self.model_dir,
                                                "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        # Load pre-trained model weights, if available (filenames must be defined first)
        if self.load_weights and os.path.isfile(self.actor_filename):
            self.load_weights_from_file()

        if self.save_weights_every:
            print("Saving model weights every {} episodes".format(self.save_weights_every))  # [debug]

        print("Original spaces: {}, {}\nConstrained spaces: {}, {}".format(
            self.task.observation_space.shape, self.task.action_space.shape,
            self.state_size, self.action_size))


    def reset_episode_vars(self):
        self.total_reward = 0
        self.episode_duration = 0
        self.last_state = None
        self.last_action = None

    def preprocess_state(self, state):
        """Reduce state vector to relevant dimensions."""
        return state[self.state_start:self.state_end]  # position only

    def postprocess_action(self, action):
        """Return complete action vector."""
        complete_action = np.zeros(self.task.action_space.shape)  # shape: (6,)
        complete_action[self.action_start:self.action_end] = action  # linear force only
        return complete_action

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
                        header=not os.path.isfile(self.stats_filename))  # write header first time only

    def step(self, state, reward, done):

        state = self.preprocess_state(state)

        self.total_reward += reward

        # Choose an action
        action = self.act(state)
        self.episode_duration += 1
        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)

        self.last_state = state
        self.last_action = action

        if done:
            # Write episode stats
            self.write_stats([self.episode, self.total_reward])
            print('episode={}, reward={:8.3f}, duration={}'.format(
                self.episode, self.total_reward, self.episode_duration))
            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.save_weights()
            self.episode += 1
            self.reset_episode_vars()

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        return self.postprocess_action(action)

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.predict_actions(states)
        return actions + self.noise.sample()  # add some noise for exploration

    def soft_update(self, local_model, target_model):
        raise NotImplementedError("{} must override soft_update()".format(self.__class__.__name__))

    def init_params(self):
        raise NotImplementedError("{} must override init_params()".format(self.__class__.__name__))

    def init_actor_models(self):
        raise NotImplementedError("{} must override init_actor_models()".format(self.__class__.__name__))

    def init_critic_models(self):
        raise NotImplementedError("{} must override init_critic_models()".format(self.__class__.__name__))

    def load_weights_from_file(self):
        raise NotImplementedError("{} must override load_weights_from_file()".format(self.__class__.__name__))

    def save_weights(self):
        raise NotImplementedError("{} must override save_weights()".format(self.__class__.__name__))

    def predict_actions(self, states):
        raise NotImplementedError("{} must override predict_actions(states)".format(self.__class__.__name__))

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        raise NotImplementedError("{} must override learn(experiences)".format(self.__class__.__name__))