Example #1
class myDDPG(BaseAgent):
    def __init__(self, task):

        self.task = task
        self.state_size = np.prod(self.task.observation_space.shape)
        self.action_size = np.prod(self.task.action_space.shape)

        # Restrict the state and action spaces
        self.limit_state_size = 3
        self.limit_action_size = 1
        self.action_low = self.task.action_space.low[2]
        self.action_high = self.task.action_space.high[2]

        # Initialize the previous action and state: a(t-1) and s(t-1)
        self.last_state = None
        self.last_action = None

        # Set up local and target models for the Actor and the Critic
        self.local_actor = Actor(self.limit_state_size, self.limit_action_size,
                                 self.action_low, self.action_high)
        self.local_critic = Critic(self.limit_state_size,
                                   self.limit_action_size)
        self.target_actor = Actor(self.limit_state_size,
                                  self.limit_action_size, self.action_low,
                                  self.action_high)
        self.target_critic = Critic(self.limit_state_size,
                                    self.limit_action_size)
        # Hyperparameters
        self.batch_size = 150
        self.buffer_size = 10000
        self.soft_params = 0.005
        self.gamma = 0.99
        # Replay buffer
        self.memory = ReplayBuffer(self.buffer_size)
        # Random exploration noise process
        self.noise = OUNoise(self.limit_action_size)

    """ 接收task传来的上一个动作a(t-1)引发的R(t)和S(t),选择当前动作a(t),并通过学习优化策略"""

    def step(self, state, reward, done):

        state = self.preprocess_state(state)

        # Choose action a(t) based on the current state s(t)
        action = self.act(state)

        # Store the experience tuple in the replay buffer
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state,
                            done)
        self.last_state = state
        self.last_action = action

        # Once enough experiences have been stored, sample a batch and learn from it
        if len(self.memory) >= self.batch_size:
            experiences = self.memory.sample(batch_size=self.batch_size)
            self.learn(experiences)

        # Finally, return the complete action vector
        complete_action = self.postprocess_action(action)
        return complete_action

    def act(self, state):
        # Feed the current state (vector) to the Actor to get the action (vector) under the current policy
        input_state = np.array(state)
        # Add exploration noise
        action = self.local_actor.model.predict(
            input_state) + self.noise.sample()
        return action.astype(np.float32)

    def learn(self, experiences):
        # Split the experiences into states, actions, rewards, dones, next_states
        states, actions, rewards, dones, next_states = [], [], [], [], []
        for ex in experiences:
            if ex is not None:
                states.append(ex.state)
                actions.append(ex.action)
                rewards.append(ex.reward)
                next_states.append(ex.next_state)
                dones.append(ex.done)
        # Convert each list to a numpy array with a consistent dtype
        states = np.array(states).astype(np.float32)
        actions = np.array(actions).astype(np.float32)
        rewards = np.array(rewards).astype(np.float32)
        dones = np.array(dones).astype(np.uint8)
        next_states = np.array(next_states).astype(np.float32)
        """训练 Critic,更新模型参数 (用target_model作为标签训练local_model)"""

        # Feed the batch of s(t+1) into target_actor to predict the batch of a(t+1)
        input_next_states = np.reshape(
            next_states, (len(next_states), self.limit_state_size))
        next_actions = self.target_actor.model.predict(
            input_next_states).astype(np.float32)

        # Feed s(t+1) and a(t+1) into target_critic to get Q(s(t+1), a(t+1))
        input_next_actions = np.reshape(
            next_actions,
            (len(next_actions), self.limit_action_size)).astype(np.float32)
        Q_sa = self.target_critic.model.predict(
            [input_next_states, input_next_actions]).astype(np.float32)
        # Compute Q_targets = r(t) + gamma * Q(s(t+1), a(t+1)) * (1 - done)
        Q_targets = (np.reshape(rewards, (-1, 1)).astype(np.float32)
                     + self.gamma * Q_sa *
                     (1 - np.reshape(dones, (-1, 1)).astype(np.uint8)))

        # Train local_critic using Q_targets as the labels
        input_states = np.reshape(states, (len(states), self.limit_state_size))
        input_actions = np.reshape(
            actions, (len(actions), self.limit_action_size)).astype(np.float32)
        self.local_critic.model.fit([input_states, input_actions], Q_targets)
        """ 训练 Actor,更新模型参数"""

        # Get the gradients of Q with respect to the actions (action_gradients) from local_critic
        g_a = self.local_critic.get_action_gradients(
            inputs=[input_states, input_actions, 0])
        action_gradient = np.reshape(g_a, (-1, self.limit_action_size))
        # Train local_actor with the action gradients
        self.local_actor.train_fn(inputs=[input_states, action_gradient, 1])
        """ soft_update """
        self.soft_update(self.local_critic, self.target_critic)
        self.soft_update(self.local_actor, self.target_actor)

    """限制状态空间,把task传来的8维状态向量降到3维(z,vel,linear_acceleration.z)"""

    def preprocess_state(self, raw_state):
        state = np.array([raw_state])
        return state[:, 2:5]

    """把1维动作向量扩展到6维(剩下维度都补零),以供返回给task"""

    def postprocess_action(self, action):
        complete_action = np.zeros((1, self.action_size))  # returned as complete_action[0], shape (action_size,)
        complete_action[:, 2] = action
        return complete_action[0]

    """软更新,用local模型的权重更新target模型权重"""

    def soft_update(self, local, target):
        local_weights = np.array(local.model.get_weights())
        target_weights = np.array(target.model.get_weights())

        new_weights = self.soft_params * local_weights + (
            1 - self.soft_params) * target_weights
        target.model.set_weights(new_weights)
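
Both examples instantiate ReplayBuffer and OUNoise helpers that are not shown on this page. The sketch below is one plausible minimal implementation, consistent with how the agents use them (add, sample, len(), attribute access such as ex.state, and noise.sample()); the Experience namedtuple field names and the OU parameters mu, theta, sigma are assumptions, not taken from the original code.

import random
from collections import deque, namedtuple

import numpy as np

# Hypothetical experience record; field names chosen to match ex.state, ex.action, ... above
Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size buffer of experience tuples."""

    def __init__(self, buffer_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped automatically

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size=64):
        # Uniform random sample without replacement
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        return len(self.memory)


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.3):  # parameter values are assumptions
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) dt + sigma * dW, with dt = 1
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state
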
Example #2
class DDPG_combined(BaseAgent):
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):

        # Load/save parameters
        self.load_weights = True  # try to load weights from previously saved models
        self.save_weights_every = 100  # save weights every n episodes, None to disable
        self.model_dir = util.get_param('out')  # you can use a separate subdirectory for each task and/or neural net architecture
        self.model_name = "{}-model".format(task.taskname)
        self.model_ext = ".h5"
        if self.load_weights or self.save_weights_every:
            self.actor_filename = os.path.join(self.model_dir,
                "{}_actor{}".format(self.model_name, self.model_ext))
            self.critic_filename = os.path.join(self.model_dir,
                "{}_critic{}".format(self.model_name, self.model_ext))
            print("Actor filename :", self.actor_filename)  # [debug]
            print("Critic filename:", self.critic_filename)  # [debug]

        # Task (environment) information
        self.task = task  # should contain observation_space and action_space
        self.state_size = 3  # z position, velocity and time elapsed (sec)
        self.state_range = self.task.observation_space.high[2] - self.task.observation_space.low[2]
        self.action_size = 1 # it seems only z linear force is needed for all the tasks
        self.action_range = self.task.action_space.high[2] - self.task.action_space.low[2]

        # Actor (Policy) Model
        self.action_low = self.task.action_space.low[2]
        self.action_high = self.task.action_space.high[2]
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Load pre-trained model weights, if available
        if self.load_weights and os.path.isfile(self.actor_filename):
            try:
                self.actor_local.model.load_weights(self.actor_filename)
                self.critic_local.model.load_weights(self.critic_filename)
                print("Model weights loaded from file!")  # [debug]
            except Exception as e:
                print("Unable to load model weights from file!")
                print("{}: {}".format(e.__class__.__name__, str(e)))

        if self.save_weights_every:
            print("Saving model weights every {} episodes".format(
                self.save_weights_every))  # [debug]

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.noise = OUNoise(self.action_size)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # Episode variables
        self.episode = 0
        self.reset_episode_vars()

        # Save episode stats
        self.stats_filename = os.path.join(
            util.get_param('out'),
            "stats_{}.csv".format(util.get_timestamp()))  # path to CSV file
        self.stats_columns = ['episode', 'total_reward']  # specify columns to save
        print("Saving stats {} to {}".format(self.stats_columns, self.stats_filename))  # [debug]

    def write_stats(self, stats):
        """Write single episode stats to CSV file."""
        df_stats = pd.DataFrame([stats], columns=self.stats_columns)  # single-row dataframe
        df_stats.to_csv(self.stats_filename, mode='a', index=False,
            header=not os.path.isfile(self.stats_filename))  # write header first time only

    def reset_episode_vars(self):
        self.episode += 1
        self.last_state = None
        self.last_action = None
        self.total_reward = 0.0

    def step(self, state, reward, done):
        # Transform state vector
        state = state.reshape(1, -1)  # convert to row vector

        # Choose an action
        action = self.act(state)

        # Save experience / reward
        if self.last_state is not None and self.last_action is not None:
            self.memory.add(self.last_state, self.last_action, reward, state, done)
            self.total_reward += reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences)

        self.last_state = state
        self.last_action = action

        if done:
            # Write episode stats
            self.write_stats([self.episode, self.total_reward])
            print('Total reward: {}'.format(self.total_reward))

            # Save model weights at regular intervals
            if self.save_weights_every and self.episode % self.save_weights_every == 0:
                self.actor_local.model.save_weights(self.actor_filename)
                self.critic_local.model.save_weights(self.critic_filename)
                print("Model weights saved at episode", self.episode)  # [debug]
            self.reset_episode_vars()

        return action

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor_local.model.predict(states)
        noise = 1. / (self.episode + 1)
        #return actions + self.noise.sample()  # add some noise for exploration
        return actions + noise

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
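
Neither example includes the Actor and Critic wrapper classes it instantiates. The sketch below shows one common way to write them with the TF1-era standalone Keras functional API so that they expose the interface used above: model, train_fn([states, action_gradients, learning_phase]) and get_action_gradients([states, actions, learning_phase]). The layer sizes, activations, and optimizer settings are illustrative assumptions, and the pattern relies on K.gradients / K.function / optimizer.get_updates, which are not available under eager TensorFlow 2.

from keras import layers, models, optimizers
from keras import backend as K


class Actor:
    """Policy model: maps states to actions in [action_low, action_high]."""

    def __init__(self, state_size, action_size, action_low, action_high):
        states = layers.Input(shape=(state_size,), name='states')
        net = layers.Dense(32, activation='relu')(states)
        net = layers.Dense(64, activation='relu')(net)
        raw_actions = layers.Dense(action_size, activation='sigmoid')(net)
        # Rescale the sigmoid output [0, 1] to the task's action range
        actions = layers.Lambda(
            lambda x: x * (action_high - action_low) + action_low)(raw_actions)
        self.model = models.Model(inputs=states, outputs=actions)

        # Custom training function: ascend the gradient of Q w.r.t. the chosen actions
        action_gradients = layers.Input(shape=(action_size,))
        loss = K.mean(-action_gradients * actions)
        optimizer = optimizers.Adam()
        updates_op = optimizer.get_updates(
            params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(
            inputs=[self.model.input, action_gradients, K.learning_phase()],
            outputs=[],
            updates=updates_op)


class Critic:
    """Value model: maps (state, action) pairs to Q-values."""

    def __init__(self, state_size, action_size):
        states = layers.Input(shape=(state_size,), name='states')
        actions = layers.Input(shape=(action_size,), name='actions')
        net_states = layers.Dense(32, activation='relu')(states)
        net_actions = layers.Dense(32, activation='relu')(actions)
        net = layers.Activation('relu')(layers.Add()([net_states, net_actions]))
        Q_values = layers.Dense(1, name='q_values')(net)
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        self.model.compile(optimizer=optimizers.Adam(), loss='mse')

        # Gradient of Q w.r.t. the actions, consumed by the actor's train_fn
        action_gradients = K.gradients(Q_values, actions)
        self.get_action_gradients = K.function(
            inputs=[*self.model.input, K.learning_phase()],
            outputs=action_gradients)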