import tensorflow as tf

# Actor, Critic and random_string are assumed to be defined elsewhere in the
# surrounding project; a sketch of the assumed ReplayBuffer interface follows
# the class.


class ActorCritic(object):
    def __init__(self,
                 state_dim,
                 action_dim,
                 final_activation=tf.nn.tanh,
                 action_bound=0.4,
                 training_batch_size=32,
                 GAMMA=0.95,
                 lr=0.001,
                 replay_buffer_size=1024):
        self.ID = random_string(10)
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.final_activation = final_activation
        self.action_bound = action_bound
        self.GAMMA = GAMMA
        self.lr = lr
        self.replay_buffer_size = replay_buffer_size
        self.replay_buffer = ReplayBuffer(replay_buffer_size)
        self.training_batch_size = training_batch_size
        with tf.variable_scope(self.ID) as scope:
            self.actor = Actor(self.state_dim, self.action_dim,
                               self.action_bound, self.lr,
                               self.final_activation)
            self.critic = Critic(self.state_dim, self.action_dim, self.lr)

    def add_to_replay_buffer(self, state, action, reward, resulting_state):
        self.replay_buffer.add(state, action, reward, resulting_state)

    def add_batch_to_replay_buffer(self, states, actions, rewards,
                                   resulting_states):
        for s, a, r, rs in zip(states, actions, rewards, resulting_states):
            self.replay_buffer.add(s, a, r, rs)

    def get_batch(self, training_batch_size=None):
        if not training_batch_size:
            training_batch_size = self.training_batch_size
        return self.replay_buffer.sample_batch(training_batch_size)

    def train_from_replay_buffer(self, should_print=False):
        # Known limitation: terminal transitions are not masked, so the
        # bootstrap term GAMMA * Q(s', a') is still applied when the episode
        # has ended.
        # Procedure: sample a batch, let the actor pick actions for the
        # resulting states, let the critic score those (state, action) pairs,
        # form targets reward + GAMMA * next_q, then optimize the critic and
        # train the actor on the critic's action gradients.
        if not self.replay_buffer.size():
            print('buffer empty!')
            return 0
        states, actions, rewards, resulting_states = self.replay_buffer.sample_batch(
            self.training_batch_size)
        predicted_action = self.actor.get_actions(resulting_states)
        predicted_vals = self.critic.predict_q_val(resulting_states,
                                                   predicted_action)
        true_vals = rewards + (self.GAMMA * predicted_vals)
        losses = self.critic.optimize_q_val(states, actions, true_vals)
        grads = self.critic.get_action_grads(states, actions)
        self.actor.train_from_batch(states, grads)
        if should_print:
            actual_q, out = self.critic.return_q_and_out(
                states, actions, true_vals)
            print('ACTUAL_Q: {}\n\n'.format(actual_q))
            print('OUT: {}'.format(out))
        return losses

    def get_actions(self, states):
        return self.actor.get_actions(states)
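

# A minimal sketch of the replay buffer interface ActorCritic relies on above
# (add, size, sample_batch). The class name matches the call sites, but the
# internals are assumptions, not the original project's implementation: a
# fixed-size deque of (state, action, reward, resulting_state) tuples sampled
# uniformly at random.
import random
from collections import deque

import numpy as np


class ReplayBuffer(object):
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, resulting_state):
        self.buffer.append((state, action, reward, resulting_state))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # never request more samples than the buffer currently holds
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, resulting_states = map(np.array, zip(*batch))
        return states, actions, rewards, resulting_states
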
Example #2
import numpy as np
import tensorflow as tf

# Actor, Critic and Memory are assumed to be defined elsewhere in the
# surrounding project; a sketch of the assumed Noise helper follows the class.


class DDPG:

    def __init__(self, sess, params):
        self.sess = sess
        # expose every entry of params (dimensions, gamma, b_size, ...) as an attribute
        self.__dict__.update(params)
        # create placeholders
        self.create_input_placeholders()
        # create actor/critic models
        self.actor = Actor(self.sess, self.inputs, **self.actor_params)
        self.critic = Critic(self.sess, self.inputs, **self.critic_params)
        # parse comma-separated noise parameter strings into float arrays
        self.noise_params = {k: np.array(list(map(float, v.split(","))))
                             for k, v in self.noise_params.items()}
        self.noise = Noise(**self.noise_params)
        # running level of the Ornstein-Uhlenbeck exploration noise
        self.ou_level = np.zeros(self.dimensions["u"])
        self.memory = Memory(self.n_mem_objects,
                             self.memory_size)

    def create_input_placeholders(self):
        self.inputs = {}
        with tf.name_scope("inputs"):
            for ip_name, dim in self.dimensions.items():
                self.inputs[ip_name] = tf.placeholder(tf.float32,
                                                      shape=(None, dim),
                                                      name=ip_name)
            self.inputs["g"] = tf.placeholder(tf.float32,
                                              shape=self.inputs["u"].shape,
                                              name="a_grad")
            self.inputs["p"] = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name="pred_q")

    def step(self, x, is_u_discrete, explore=True):
        x = x.reshape(-1, self.dimensions["x"])
        u = self.actor.predict(x)
        if explore:
            # add temporally correlated Ornstein-Uhlenbeck noise for exploration
            self.ou_level = self.noise.ornstein_uhlenbeck_level(self.ou_level)
            u = u + self.ou_level
        q = self.critic.predict(x, u)
        if is_u_discrete:
            return [np.argmax(u), u[0], q[0]]
        return [u[0], u, q[0]]

    def remember(self, experience):
        self.memory.add(experience)

    def train(self):
        # check if the memory contains enough experiences
        if self.memory.size < 3*self.b_size:
            return
        x, g, ag, u, r, nx, ng, t = self.get_batch()
        # Hindsight Experience Replay: relabel roughly 80% of the batch with
        # the achieved goal and mark those transitions as successful/terminal.
        her_idxs = np.where(np.random.random(self.b_size) < 0.80)[0]
        g[her_idxs] = ag[her_idxs]
        r[her_idxs] = 1
        t[her_idxs] = 1
        # append the goal to the state so both networks are goal-conditioned
        x = np.hstack([x, g])
        nx = np.hstack([nx, ng])
        # TD target with terminal masking: r + gamma * Q'(nx, pi'(nx)) * (1 - t)
        nu = self.actor.predict_target(nx)
        tq = r + self.gamma*self.critic.predict_target(nx, nu)*(1-t)
        self.critic.train(x, u, tq)
        # update the actor along the critic's action gradients (DDPG policy update)
        grad = self.critic.get_action_grads(x, u)
        self.actor.train(x, grad)
        self.update_targets()
        self.update_targets()

    def get_batch(self):
        return self.memory.sample(self.b_size)

    def update_targets(self):
        self.critic.update_target()
        self.actor.update_target()
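

# A minimal sketch of the Noise helper assumed by DDPG above: a vector-valued
# Ornstein-Uhlenbeck process, the usual choice for temporally correlated
# exploration noise in DDPG. The method name ornstein_uhlenbeck_level matches
# the call site; the parameter names (mu, theta, sigma, dt) and the exact
# update rule are assumptions, not taken from the original project.
import numpy as np


class Noise(object):
    def __init__(self, mu, theta, sigma, dt=0.01):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.theta = np.asarray(theta, dtype=np.float64)
        self.sigma = np.asarray(sigma, dtype=np.float64)
        self.dt = dt

    def ornstein_uhlenbeck_level(self, level):
        # one Euler-Maruyama step: dx = theta*(mu - x)*dt + sigma*sqrt(dt)*dW
        drift = self.theta * (self.mu - level) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.randn(*np.shape(level))
        return level + drift + diffusion


# Hypothetical interaction loop, assuming the stored experience tuple mirrors
# the unpack order in DDPG.train():
#   action, action_full, q = agent.step(x, is_u_discrete=False)
#   agent.remember((x, g, ag, action_full, r, nx, ng, t))
#   agent.train()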