Example #1
class DDPG_REC:

    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr, critic_lr,
                 gamma, buffer_size, item_space, summary_dir):

        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim, batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)

    def gene_actions(self, weight_batch):
        """use output of actor network to calculate action list
        Args:
            weight_batch: actor network outputs

        Returns:
            recommendation list
        """
        item_ids = list(self.item_space.keys())
        item_weights = list(self.item_space.values())
        max_ids = list()
        for weight in weight_batch:
            score = np.dot(item_weights, np.transpose(weight))
            idx = np.argmax(score, 0)
            max_ids.append([item_ids[_] for _ in idx])
        return max_ids

    # def gene_action(self, weight):
    #     """use output of actor network to calculate action list
    #     Args:
    #         weight: actor network outputs
    #
    #     Returns:
    #         recommendation list
    #     """
    #     item_ids = list(self.item_space.keys())
    #     item_weights = list(self.item_space.values())
    #     score = np.dot(item_weights, np.transpose(weight))
    #     idx = np.argmax(score)
    #     return item_ids[idx]

    @staticmethod
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("reward", episode_reward)
        episode_max_q = tf.Variable(0.)
        tf.summary.scalar("max_q_value", episode_max_q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar("critic_loss", critic_loss)

        summary_vars = [episode_reward, episode_max_q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    def _train(self):
        samples = self.replay_buffer.sample_batch(self.batch_size)
        state_batch = np.asarray([_[0] for _ in samples])
        action_batch = np.asarray([_[1] for _ in samples])
        reward_batch = np.asarray([_[2] for _ in samples])
        n_state_batch = np.asarray([_[3] for _ in samples])
        done_batch = np.asarray([_[4] for _ in samples])

        seq_len_batch = np.asarray([self.state_item_num] * self.batch_size)

        # compute the target q value using the target actor and target critic
        action_weights = self.actor.predict_target(state_batch, seq_len_batch)  # shape: (batch_size, a_dim)
        n_action_batch = self.gene_actions(action_weights.reshape((-1, self.action_item_num, self.emb_dim)))
        n_action_emb_batch = get_item_emb(n_action_batch, item_ids_emb_dict)
        target_q_batch = self.critic.predict_target(n_state_batch.reshape((-1, self.s_dim)),
                                                    n_action_emb_batch.reshape((-1, self.a_dim)), seq_len_batch)
        y_batch = []
        for i in range(self.batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.critic.gamma * target_q_batch[i])

        # train critic
        q_value, critic_loss, _ = self.critic.train(state_batch, action_batch,
                                                    np.reshape(y_batch, (self.batch_size, 1)), seq_len_batch)

        # train actor
        action_weight_batch_for_gradients = self.actor.predict(state_batch, seq_len_batch)
        action_batch_for_gradients = self.gene_actions(action_weight_batch_for_gradients)
        action_emb_batch_for_gradients = get_item_emb(action_batch_for_gradients, item_ids_emb_dict)
        a_gradient_batch = self.critic.action_gradients(state_batch,
                                                        action_emb_batch_for_gradients.reshape((-1, self.a_dim)),
                                                        seq_len_batch)
        self.actor.train(state_batch, a_gradient_batch[0], seq_len_batch)

        # update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return np.amax(q_value), critic_loss

    def action(self, state):
        weight = self.actor.predict(np.reshape(state, [1, self.s_dim]), np.array([self.state_item_num])) + \
                 self.exploration_noise.noise().reshape(
                     (1, self.action_item_num, int(self.a_dim / self.action_item_num)))
        action = self.gene_actions(weight)
        return np.array(action[0])

    def perceive_and_train(self, state, action, reward, n_state, done):
        action_emb = get_item_emb(action, item_ids_emb_dict)
        self.replay_buffer.add(list(state.reshape((self.s_dim,))),
                               list(action_emb.reshape((self.a_dim,))),
                               [reward],
                               list(n_state.reshape((self.s_dim,))),
                               [done])

        # Start training once the replay buffer holds more than a batch of transitions
        ep_q_value_, critic_loss = 0, 0
        if self.replay_buffer.size() > self.batch_size:
            ep_q_value_, critic_loss = self._train()

        # if self.time_step % 10000 == 0:
        # self.actor_network.save_network(self.time_step)
        # self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return ep_q_value_, critic_loss

    def write_summary(self, ep_reward, ep_q_value, loss, i):
        summary_str = self.sess.run(self.summary_ops, feed_dict={self.summary_vars[0]: ep_reward,
                                                                 self.summary_vars[1]: ep_q_value,
                                                                 self.summary_vars[2]: loss})
        self.writer.add_summary(summary_str, i)

    def save(self):
        self.writer.close()
        saver = tf.train.Saver()
        ckpt_path = os.path.join(os.path.dirname(__file__), "models")
        saver.save(self.sess, ckpt_path, write_meta_graph=False)
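
A minimal usage sketch for DDPG_REC (not part of the source). It assumes a hypothetical recommendation environment with gym-style reset()/step(), a hypothetical load_item_embeddings() helper producing the item_space dict of item-id-to-embedding mappings, and placeholder hyperparameter values.

item_space = load_item_embeddings()              # hypothetical helper: {item_id: embedding vector}
env = RecommenderEnv(item_space)                 # hypothetical environment with reset()/step()
agent = DDPG_REC(state_item_num=5, action_item_num=1, emb_dim=32,
                 batch_size=64, tau=0.001, actor_lr=1e-4, critic_lr=1e-3,
                 gamma=0.9, buffer_size=100000,
                 item_space=item_space, summary_dir="summaries/")

for episode in range(100):
    state = env.reset()                          # item-embedding state, shape (state_item_num, emb_dim)
    ep_reward, ep_q, loss, done = 0.0, 0.0, 0.0, False
    while not done:
        action = agent.action(state)             # recommended item ids, with OU exploration noise
        n_state, reward, done = env.step(action)
        ep_q, loss = agent.perceive_and_train(state, action, reward, n_state, done)
        ep_reward += reward
        state = n_state
    agent.write_summary(ep_reward, ep_q, loss, episode)

agent.save()
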
Example #2
class Agent:
    def __init__(self, experiment, batch_size):
        self._dummy_env = gym.make(experiment)
        self._sess = tf.Session()

        self._sum_writer = tf.summary.FileWriter('logs/', self._sess.graph)

        # Hardcoded for now
        self._dim_state = 25
        self._dim_goal = 3
        self._dim_action = self._dummy_env.action_space.shape[0]
        self._dim_env = 1
        self._batch_size = batch_size

        # agent noise
        self._action_noise = OrnsteinUhlenbeckActionNoise(
            mu=np.zeros(self._dim_action))

        self._actor = Actor(self._sess, self._dim_state, self._dim_goal,
                            self._dim_action, self._dummy_env, TAU,
                            LEARNING_RATE, self._batch_size)

        self._critic = Critic(self._sess, self._dim_state, self._dim_goal,
                              self._dim_action, self._dim_env, self._dummy_env,
                              TAU, LEARNING_RATE,
                              self._actor.get_num_trainable_vars(),
                              self._sum_writer)

        self._saver = tf.train.Saver(max_to_keep=None)

        self._sess.run(tf.global_variables_initializer())

        self._actor.initialize_target_network()
        self._critic.initialize_target_network()

        # training monitoring
        self._success_rate = tf.Variable(0., name="success_rate")
        self._python_success_rate = tf.placeholder("float32", [])

        self._update_success_rate = self._success_rate.assign(
            self._python_success_rate)
        self._merged = tf.summary.scalar("successrate",
                                         self._update_success_rate)
    def get_dim_state(self):
        return self._dim_state

    def get_dim_action(self):
        return self._dim_action

    def get_dim_env(self):
        return self._dim_env

    def get_dim_goal(self):
        return self._dim_goal

    def evaluate_actor(self, actor_predict, obs, goal, history):

        assert (
            history.shape[0] == MAX_STEPS), "history must be of size MAX_STEPS"

        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        history = history.reshape(1, history.shape[0], history.shape[1])

        return actor_predict(obs, goal, history)

    def evaluate_actor_batch(self, actor_predict, obs, goal, history):

        return actor_predict(obs, goal, history)

    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        action = action.reshape(1, self._dim_action)
        history = history.reshape(1, history.shape[0], history.shape[1])
        env = env.reshape(1, self._dim_env)

        return critic_predict(env, obs, goal, action, history)

    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history,
                              env):
        return critic_predict(env, obs, goal, action, history)

    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
        return self._critic.train(env, obs, goal, action, history,
                                  predicted_q_value)

    def train_actor(self, obs, goal, history, a_gradient):
        return self._actor.train(obs, goal, history, a_gradient)

    def action_gradients_critic(self, obs, action, goal, history, env):
        return self._critic.action_gradients(env, obs, goal, action, history)

    def update_target_actor(self):
        self._actor.update_target_network()

    def update_target_critic(self):
        self._critic.update_target_network()

    def action_noise(self):
        return self._action_noise()

    def update_success(self, success_rate, step):
        _, result = self._sess.run(
            [self._update_success_rate, self._merged],
            feed_dict={self._python_success_rate: success_rate})
        self._sum_writer.add_summary(result, step)

    def save_model(self, filename):
        self._saver.save(self._sess, filename)

    def load_model(self, filename):
        self._saver.restore(self._sess, filename)
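
A hedged rollout-and-logging sketch for this Agent (not from the source). The environment id, the actor_predict callable (whatever prediction function the Actor class exposes, passed in by design), the history layout, and the [0] batch indexing are all assumptions; MAX_STEPS is the module-level constant the class already asserts against.

import gym
import numpy as np

agent = Agent('FetchPush-v1', batch_size=64)     # assumed env; its 25-dim observation matches _dim_state
env = gym.make('FetchPush-v1')

for epoch in range(50):
    successes, n_episodes = 0.0, 16
    for _ in range(n_episodes):
        obs = env.reset()                        # dict with 'observation' and 'desired_goal'
        history = np.zeros((MAX_STEPS, agent.get_dim_state() + agent.get_dim_action()))
        for step in range(MAX_STEPS):
            # actor_predict: the Actor's prediction callable; [0] strips the batch dimension
            action = agent.evaluate_actor(actor_predict, obs['observation'],
                                          obs['desired_goal'], history)[0]
            action += agent.action_noise()       # Ornstein-Uhlenbeck exploration noise
            obs, reward, done, info = env.step(action)
            history[step] = np.concatenate([obs['observation'], action])
        successes += float(info.get('is_success', 0.0))
    agent.update_success(successes / n_episodes, epoch)   # writes the success-rate summary
    agent.save_model('checkpoints/agent.ckpt')
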
Example #3
class DDPG:
    def __init__(self, env, batch_size, mem_size, discount, actor_params,
                 critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount
        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)
        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high
        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim,
                              self._action_dim, critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        more than batch_size experiences.
        '''

        if (self._memory.size() > self._batch_size):
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(
            self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        return self._critic._model.predict(state, action)

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r if done else r + gamma * qnext
        '''
        # use actor network to determine the next action under current policy
        # estimate Q values from the critic network

        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)

        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # store in replay buffer
        self._memory.add(state, action, reward, done, next_state)

        self.train()
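
A brief usage sketch for this DDPG class (not from the source), assuming a continuous-control gym environment and placeholder actor_params / critic_params dicts whose contents the Actor and Critic constructors define elsewhere.

import gym

env = gym.make('Pendulum-v0')
agent = DDPG(env, batch_size=64, mem_size=100000, discount=0.99,
             actor_params=actor_params, critic_params=critic_params)

for episode in range(200):
    state = env.reset().reshape(1, -1)           # add a batch dimension for model.predict
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action[0])
        next_state = next_state.reshape(1, -1)
        agent.experience(state, action, reward, done, next_state)   # stores the transition and trains
        state = next_state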